Build branch main with version main (173327cc)

Build pipeline: vsh-ci-build-template-k4qzr

Source commit: 173327cc56

Source message: Cellranger multi conversion: fix combined AB + CB probe experiments (#1062)
This commit is contained in:
CI
2025-08-22 08:50:18 +00:00
commit cd5554d22f
2226 changed files with 1154442 additions and 0 deletions

View File

@@ -0,0 +1,471 @@
name: "celltypist"
namespace: "annotate"
version: "main"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
description: "Input dataset (query) arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "The input (query) data to be labeled. Should be a .h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "The layer in the input data containing log normalized counts to\
\ be used for cell type annotation if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_var_gene_names"
description: "The name of the adata var column in the input data containing gene\
\ names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--input_reference_gene_overlap"
description: "The minimum number of genes present in both the reference and query\
\ datasets.\n"
info: null
default:
- 100
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "The reference data to train the CellTypist classifiers on. Only\
\ required if a pre-trained --model is not provided."
info: null
example:
- "reference.h5mu"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "The layer in the reference data to be used for cell type annotation\
\ if .X is not to be used. Data are expected to be processed in the same way\
\ as the --input query dataset."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_target"
description: "The name of the adata obs column in the reference data containing\
\ cell type annotations."
info: null
default:
- "cell_ontology_class"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_gene_names"
description: "The name of the adata var column in the reference data containing\
\ gene names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_input"
description: ".var column containing highly variable genes. By default, do not\
\ subset genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Model arguments"
description: "Model arguments."
arguments:
- type: "file"
name: "--model"
description: "Pretrained model in pkl format. If not provided, the model will\
\ be trained on the reference data and --reference should be provided."
info: null
example:
- "pretrained_model.pkl"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--feature_selection"
description: "Whether to perform feature selection."
info: null
default:
- false
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--majority_voting"
description: "Whether to refine the predicted labels by running the majority voting\
\ classifier after over-clustering."
info: null
default:
- false
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--C"
description: "Inverse of regularization strength in logistic regression."
info: null
default:
- 1.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_iter"
description: "Maximum number of iterations before reaching the minimum of the\
\ cost function."
info: null
default:
- 1000
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--use_SGD"
description: "Whether to use the stochastic gradient descent algorithm."
info: null
direction: "input"
- type: "double"
name: "--min_prop"
description: "\"For the dominant cell type within a subcluster, the minimum proportion\
\ of cells required to \nsupport naming of the subcluster by this cell type.\
\ Ignored if majority_voting is set to False. \nSubcluster that fails to pass\
\ this proportion threshold will be assigned 'Heterogeneous'.\"\n"
info: null
default:
- 0.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_predictions"
description: "In which `.obs` slots to store the predicted information.\n"
info: null
default:
- "celltypist_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_probability"
description: "In which `.obs` slots to store the probability of the predictions.\n"
info: null
default:
- "celltypist_probability"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "cross_check_genes.py"
- type: "file"
path: "subset_vars.py"
- type: "file"
path: "set_var_index.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\
\ of logistic regression classifiers optimised by the stochastic gradient descent\
\ algorithm."
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "annotation_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highcpu"
- "highmem"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.10-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "libhdf5-dev"
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "scanpy~=1.10.4"
upgrade: true
- type: "python"
user: false
packages:
- "celltypist==1.6.3"
upgrade: true
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/celltypist/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/celltypist"
executable: "target/executable/annotate/celltypist/celltypist"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,26 @@
from typing import List
def cross_check_genes(
query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100
) -> List[str]:
"""Cross check the overlap between two lists of genes
Parameters
----------
query_genes : List[str]
List of gene names
reference_genes : List[str]
List of gene names
Returns
-------
List[str]
List of overlapping genes
"""
common_ens_ids = list(set(reference_genes).intersection(set(query_genes)))
assert len(common_ens_ids) >= min_gene_overlap, (
f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}."
)
return common_ens_ids

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,24 @@
import anndata as ad
import re
def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
"""Sanitize gene names and set the index of the .var DataFrame.
Parameters
----------
adata : AnnData
Annotated data object
var_name : str | None
Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced.
Returns
-------
AnnData
Copy of `adata` with sanitized and replaced index
"""
if var_name:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]]
else:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index]
return adata

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,31 @@
def subset_vars(adata, subset_col):
"""Subset AnnData object on highly variable genes
Parameters
----------
adata : AnnData
Annotated data object
subset_col : str
Name of the boolean column in `adata.var` that contains the information if features should be used or not
Returns
-------
AnnData
Copy of `adata` with subsetted features
"""
if subset_col not in adata.var.columns:
raise ValueError(
f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
)
if adata.var[subset_col].dtype == "boolean":
assert adata.var[subset_col].isna().sum() == 0, (
f"The .var column `{subset_col}` contains NaN values. Can not subset data."
)
adata.var[subset_col] = adata.var[subset_col].astype("bool")
assert adata.var[subset_col].dtype == "bool", (
f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data."
)
return adata[:, adata.var[subset_col]].copy()

View File

@@ -0,0 +1,442 @@
name: "onclass"
namespace: "annotate"
version: "main"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
argument_groups:
- name: "Inputs"
description: "Input dataset (query) arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "The input (query) data to be labeled. Should be a .h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "The layer in the input data to be used for cell type annotation\
\ if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_var_gene_names"
description: "The name of the adata var column in the input data containing gene\
\ names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--input_reference_gene_overlap"
description: "The minimum number of genes present in both the reference and query\
\ datasets.\n"
info: null
default:
- 100
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Ontology"
description: "Ontology input files"
arguments:
- type: "file"
name: "--cl_nlp_emb_file"
description: "The .nlp.emb file with the cell type embeddings."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--cl_ontology_file"
description: "The .ontology file with the cell type ontology."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--cl_obo_file"
description: "The .obo file with the cell type ontology."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "The reference data to train the CellTypist classifiers on. Only\
\ required if a pre-trained --model is not provided."
info: null
example:
- "reference.h5mu"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "The layer in the reference data to be used for cell type annotation\
\ if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_target"
description: "The name of the adata obs column in the reference data containing\
\ cell type annotations."
info: null
example:
- "cell_ontology_class"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_gene_names"
description: "The name of the adata var column in the reference data containing\
\ gene names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_input"
description: ".var column containing highly variable genes. By default, do not\
\ subset genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--unknown_celltype"
description: "Label for unknown cell types.\n"
info: null
default:
- "Unknown"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_predictions"
description: "In which `.obs` slots to store the predicted information.\n"
info: null
default:
- "onclass_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_probability"
description: "In which `.obs` slots to store the probability of the predictions.\n"
info: null
default:
- "onclass_prob"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Model arguments"
description: "Model arguments"
arguments:
- type: "string"
name: "--model"
description: "\"Pretrained model path without a file extension. If not provided,\
\ the model will be trained \non the reference data and --reference should be\
\ provided. The path namespace should contain:\n - a .npz or .pkl file\n -\
\ a .data file\n - a .meta file\n - a .index file\ne.g. /path/to/model/pretrained_model_target1\
\ as saved by OnClass.\"\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_iter"
description: "Maximum number of iterations for training the model."
info: null
default:
- 30
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "cross_check_genes.py"
- type: "file"
path: "subset_vars.py"
- type: "file"
path: "set_var_index.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "OnClass is a python package for single-cell cell type annotation. It\
\ uses the Cell Ontology to capture the cell type similarity. \nThese similarities\
\ enable OnClass to annotate cell types that are never seen in the training data.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "annotation_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highcpu"
- "highmem"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "OnClass~=1.3"
- "tensorflow"
- "obonet"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/onclass/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/onclass"
executable: "target/executable/annotate/onclass/onclass"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,26 @@
from typing import List
def cross_check_genes(
query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100
) -> List[str]:
"""Cross check the overlap between two lists of genes
Parameters
----------
query_genes : List[str]
List of gene names
reference_genes : List[str]
List of gene names
Returns
-------
List[str]
List of overlapping genes
"""
common_ens_ids = list(set(reference_genes).intersection(set(query_genes)))
assert len(common_ens_ids) >= min_gene_overlap, (
f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}."
)
return common_ens_ids

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,24 @@
import anndata as ad
import re
def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
"""Sanitize gene names and set the index of the .var DataFrame.
Parameters
----------
adata : AnnData
Annotated data object
var_name : str | None
Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced.
Returns
-------
AnnData
Copy of `adata` with sanitized and replaced index
"""
if var_name:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]]
else:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index]
return adata

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,31 @@
def subset_vars(adata, subset_col):
"""Subset AnnData object on highly variable genes
Parameters
----------
adata : AnnData
Annotated data object
subset_col : str
Name of the boolean column in `adata.var` that contains the information if features should be used or not
Returns
-------
AnnData
Copy of `adata` with subsetted features
"""
if subset_col not in adata.var.columns:
raise ValueError(
f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
)
if adata.var[subset_col].dtype == "boolean":
assert adata.var[subset_col].isna().sum() == 0, (
f"The .var column `{subset_col}` contains NaN values. Can not subset data."
)
adata.var[subset_col] = adata.var[subset_col].astype("bool")
assert adata.var[subset_col].dtype == "bool", (
f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data."
)
return adata[:, adata.var[subset_col]].copy()

View File

@@ -0,0 +1,408 @@
name: "popv"
namespace: "annotate"
version: "main"
authors:
- name: "Matthias Beyens"
roles:
- "author"
info:
role: "Contributor"
links:
github: "MatthiasBeyens"
orcid: "0000-0003-3304-0706"
email: "matthias.beyens@gmail.com"
linkedin: "mbeyens"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
- name: "Robrecht Cannoodt"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Inputs"
description: "Arguments related to the input (aka query) dataset."
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "Which layer to use. If no value is provided, the counts are assumed\
\ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_obs_batch"
description: "Key in obs field of input adata for batch information. If no value\
\ is provided, batch label is assumed to be unknown."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_var_subset"
description: "Subset the input object with this column."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_obs_label"
description: "Key in obs field of input adata for label information. This is only\
\ used for training scANVI. Unlabelled cells should be set to `\"unknown_celltype_label\"\
`."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--unknown_celltype_label"
description: "If `input_obs_label` is specified, cells with this value will be\
\ treated as unknown and will be predicted by the model."
info: null
default:
- "unknown"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "User-provided reference tissue. The data that will be used as reference\
\ to call cell types."
info: null
example:
- "TS_Bladder_filtered.h5ad"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "Which layer to use. If no value is provided, the counts are assumed\
\ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_label"
description: "Key in obs field of reference AnnData with cell-type information."
info: null
default:
- "cell_ontology_class"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_batch"
description: "Key in obs field of input adata for batch information."
info: null
default:
- "donor_assay"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Arguments"
description: "Other arguments."
arguments:
- type: "string"
name: "--methods"
description: "Methods to call cell types. By default, runs to knn_on_scvi and\
\ scanvi."
info: null
example:
- "knn_on_scvi"
- "scanvi"
required: true
choices:
- "celltypist"
- "knn_on_bbknn"
- "knn_on_scanorama"
- "knn_on_scvi"
- "onclass"
- "rf"
- "scanvi"
- "svm"
direction: "input"
multiple: true
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Performs popular major vote cell typing on single cell sequence data\
\ using multiple algorithms. Note that this is a one-shot version of PopV."
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "annotation_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highmem"
- "highcpu"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "docker"
env:
- "CFLAGS=\"-mno-avx512f -mno-avx2\""
- "CPPFLAGS=\"-mno-avx512f -mno-avx2\""
- type: "apt"
packages:
- "procps"
- "git"
- "build-essential"
interactive: false
- type: "python"
user: false
packages:
- "popv~=0.4.2"
- "numpy<2"
- "setuptools"
upgrade: true
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
- type: "docker"
run:
- "cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git\n"
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/popv/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/popv"
executable: "target/executable/annotate/popv/popv"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,457 @@
name: "random_forest_annotation"
namespace: "annotate"
version: "main"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
argument_groups:
- name: "Inputs"
description: "Input dataset (query) arguments"
arguments:
- type: "file"
name: "--input"
description: "The input (query) data to be labeled. Should be a .h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "The layer in the input data to be used for cell type annotation\
\ if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_var_gene_names"
description: "The name of the adata var column in the input data containing gene\
\ names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--input_reference_gene_overlap"
description: "The minimum number of genes present in both the reference and query\
\ datasets.\n"
info: null
default:
- 100
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "The reference data to train the CellTypist classifiers on. Only\
\ required if a pre-trained --model is not provided."
info: null
example:
- "reference.h5mu"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "The layer in the reference data to be used for cell type annotation\
\ if .X is not to be used. Data are expected to be processed in the same way\
\ as the --input query dataset."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_target"
description: "Key in obs field of reference modality with cell-type information."
info: null
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_gene_names"
description: "The name of the adata var column in the reference data containing\
\ gene names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_input"
description: ".var column containing highly variable genes. By default, do not\
\ subset genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_predictions"
description: "In which `.obs` slots to store the predicted information.\n"
info: null
default:
- "random_forest_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_probability"
description: "In which `.obs` slots to store the probability of the predictions.\n"
info: null
default:
- "random_forest_probability"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Model arguments"
description: "Model arguments."
arguments:
- type: "file"
name: "--model"
description: "Pretrained model in pkl format. If not provided, the model will\
\ be trained on the reference data and --reference should be provided."
info: null
example:
- "pretrained_model.pkl"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--n_estimators"
description: "Number of trees in the random forest."
info: null
default:
- 100
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_depth"
description: "Maximum depth of the trees in the random forest. \nIf not provided,\
\ the nodes are expanded until all leaves only contain a single sample.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--criterion"
description: "The function to measure the quality of a split."
info: null
default:
- "gini"
required: false
choices:
- "gini"
- "entropy"
- "log_loss"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--class_weight"
description: "Weights associated with classes.\nThe `balanced` mode uses the values\
\ of y to automatically adjust weights inversely proportional to class frequencies\
\ in the input data.\nThe `balanced_subsample` mode is the same as `balanced`\
\ except that weights are computed based on the bootstrap sample for every tree\
\ grown.\nThe `uniform` mode gives all classes a weight of one.\n"
info: null
default:
- "balanced_subsample"
required: false
choices:
- "balanced"
- "balanced_subsample"
- "uniform"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--max_features"
description: "The number of features to consider when looking for the best split.\
\ The value can either be a positive integer or one of `sqrt`, `log2` or `all`.\n\
If integer: consider max_features features at each split.\nIf `sqrt`: max_features\
\ is the squareroot of all input features.\nIf `log2`: max_features is the log2\
\ of all input features.\nIf `all`: max features equals all input features.\n"
info: null
default:
- "200"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "cross_check_genes.py"
- type: "file"
path: "subset_vars.py"
- type: "file"
path: "set_var_index.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\
\ of random forest."
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "TS_Blood_filtered.h5mu"
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highcpu"
- "highmem"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "libhdf5-dev"
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "scikit-learn==1.4.2"
upgrade: true
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/random_forest_annotation/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/random_forest_annotation"
executable: "target/executable/annotate/random_forest_annotation/random_forest_annotation"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,26 @@
from typing import List
def cross_check_genes(
query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100
) -> List[str]:
"""Cross check the overlap between two lists of genes
Parameters
----------
query_genes : List[str]
List of gene names
reference_genes : List[str]
List of gene names
Returns
-------
List[str]
List of overlapping genes
"""
common_ens_ids = list(set(reference_genes).intersection(set(query_genes)))
assert len(common_ens_ids) >= min_gene_overlap, (
f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}."
)
return common_ens_ids

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,24 @@
import anndata as ad
import re
def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
"""Sanitize gene names and set the index of the .var DataFrame.
Parameters
----------
adata : AnnData
Annotated data object
var_name : str | None
Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced.
Returns
-------
AnnData
Copy of `adata` with sanitized and replaced index
"""
if var_name:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]]
else:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index]
return adata

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,31 @@
def subset_vars(adata, subset_col):
"""Subset AnnData object on highly variable genes
Parameters
----------
adata : AnnData
Annotated data object
subset_col : str
Name of the boolean column in `adata.var` that contains the information if features should be used or not
Returns
-------
AnnData
Copy of `adata` with subsetted features
"""
if subset_col not in adata.var.columns:
raise ValueError(
f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
)
if adata.var[subset_col].dtype == "boolean":
assert adata.var[subset_col].isna().sum() == 0, (
f"The .var column `{subset_col}` contains NaN values. Can not subset data."
)
adata.var[subset_col] = adata.var[subset_col].astype("bool")
assert adata.var[subset_col].dtype == "bool", (
f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data."
)
return adata[:, adata.var[subset_col]].copy()

View File

@@ -0,0 +1,480 @@
name: "scanvi"
namespace: "annotate"
version: "main"
authors:
- name: "Dorien Roosen"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file. Note that this needs to be the exact same dataset\
\ as the --scvi_model was trained on."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "Input layer to use. If None, X is used"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_input"
description: ".var column containing highly variable genes that were used to train\
\ the scVi model. By default, do not subset genes."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_gene_names"
description: ".var column containing gene names. By default, use the index."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_labels"
description: ".obs field containing the labels"
info: null
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--unlabeled_category"
description: "Value in the --obs_labels field that indicates unlabeled observations\n"
info: null
default:
- "Unknown"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "scVI Model"
arguments:
- type: "file"
name: "--scvi_model"
description: "Pretrained SCVI reference model to initialize the SCANVI model with."
info: null
example:
- "scvi_model.pt"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output_model"
description: "Folder where the state of the trained model will be saved to."
info: null
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_output"
description: "In which .obsm slot to store the resulting integrated embedding."
info: null
default:
- "X_scanvi_integrated"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_output_predictions"
description: "In which .obs slot to store the predicted labels."
info: null
default:
- "scanvi_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_output_probabilities"
description: "In which. obs slot to store the probabilities of the predicted labels."
info: null
default:
- "scanvi_proba"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "scANVI training arguments"
arguments:
- type: "boolean"
name: "--early_stopping"
description: "Whether to perform early stopping with respect to the validation\
\ set."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--early_stopping_monitor"
description: "Metric logged during validation set epoch."
info: null
default:
- "elbo_validation"
required: false
choices:
- "elbo_validation"
- "reconstruction_loss_validation"
- "kl_local_validation"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--early_stopping_patience"
description: "Number of validation epochs with no improvement after which training\
\ will be stopped."
info: null
default:
- 45
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--early_stopping_min_delta"
description: "Minimum change in the monitored quantity to qualify as an improvement,\
\ i.e. an absolute change of less than min_delta, will count as no improvement."
info: null
default:
- 0.0
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_epochs"
description: "Number of passes through the dataset, defaults to (20000 / number\
\ of cells) * 400 or 400; whichever is smallest."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--reduce_lr_on_plateau"
description: "Whether to monitor validation loss and reduce learning rate when\
\ validation set `lr_scheduler_metric` plateaus."
info: null
default:
- true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--lr_factor"
description: "Factor to reduce learning rate."
info: null
default:
- 0.6
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--lr_patience"
description: "Number of epochs with no improvement after which learning rate will\
\ be reduced."
info: null
default:
- 30.0
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "subset_vars.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "set_var_index.py"
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "scANVI () is a semi-supervised model for single-cell transcriptomics\
\ data. scANVI is an scVI extension that can leverage the cell type knowledge for\
\ a subset of the cells present in the data sets to infer the states of the rest\
\ of the cells.\nThis component will instantiate a scANVI model from a pre-trained\
\ scVI model, integrate the data and perform label prediction.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "scvi_model"
- type: "file"
path: "TS_Blood_filtered.h5mu"
- type: "file"
path: "pbmc_1k_protein_v3_mms.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midcpu"
- "midmem"
- "gpu"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "nvcr.io/nvidia/pytorch:25.05-py3"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
- type: "python"
user: false
packages:
- "jax[cuda]"
- "scvi-tools~=1.3.1"
upgrade: true
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/scanvi/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/scanvi"
executable: "target/executable/annotate/scanvi/scanvi"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
input_path: Union[str, Path],
output_path: Union[str, Path],
compression: Union[Literal["gzip"], Literal["lzf"]],
):
input_path, output_path = str(input_path), str(output_path)
def copy_attributes(in_object, out_object):
for key, value in in_object.attrs.items():
out_object.attrs[key] = value
def visit_path(
output_h5: H5File,
compression: Union[Literal["gzip"], Literal["lzf"]],
name: str,
object: Union[Group, Dataset],
):
if isinstance(object, Group):
new_group = output_h5.create_group(name)
copy_attributes(object, new_group)
elif isinstance(object, Dataset):
# Compression only works for non-scalar Dataset objects
# Scalar objects dont have a shape defined
if not object.compression and object.shape not in [None, ()]:
new_dataset = output_h5.create_dataset(
name, data=object, compression=compression
)
copy_attributes(object, new_dataset)
else:
output_h5.copy(object, name)
else:
raise NotImplementedError(
f"Could not copy element {name}, "
f"type has not been implemented yet: {type(object)}"
)
with (
H5File(input_path, "r") as input_h5,
H5File(output_path, "w", userblock_size=512) as output_h5,
):
copy_attributes(input_h5, output_h5)
input_h5.visititems(partial(visit_path, output_h5, compression))
with open(input_path, "rb") as input_bytes:
# Mudata puts metadata like this in the first 512 bytes:
# MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
# See mudata/_core/io.py, read_h5mu() function
starting_metadata = input_bytes.read(100)
# The metadata is padded with extra null bytes up until 512 bytes
truncate_location = starting_metadata.find(b"\x00")
starting_metadata = starting_metadata[:truncate_location]
with open(output_path, "br+") as f:
nbytes = f.write(starting_metadata)
f.write(b"\0" * (512 - nbytes))
def write_h5ad_to_h5mu_with_compression(
output_file: Union[str, Path],
h5mu: Union[str, Path],
modality_name: str,
modality_data: AnnData,
output_compression=None,
):
output_file = Path(output_file)
h5mu = Path(h5mu)
output_file_uncompressed = (
output_file.with_name(output_file.stem + "_uncompressed.h5mu")
if output_compression
else output_file
)
shutil.copyfile(h5mu, output_file_uncompressed)
write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
if output_compression:
compress_h5mu(
output_file_uncompressed, output_file, compression=output_compression
)
output_file_uncompressed.unlink()

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,24 @@
import anndata as ad
import re
def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
"""Sanitize gene names and set the index of the .var DataFrame.
Parameters
----------
adata : AnnData
Annotated data object
var_name : str | None
Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced.
Returns
-------
AnnData
Copy of `adata` with sanitized and replaced index
"""
if var_name:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]]
else:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index]
return adata

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,31 @@
def subset_vars(adata, subset_col):
"""Subset AnnData object on highly variable genes
Parameters
----------
adata : AnnData
Annotated data object
subset_col : str
Name of the boolean column in `adata.var` that contains the information if features should be used or not
Returns
-------
AnnData
Copy of `adata` with subsetted features
"""
if subset_col not in adata.var.columns:
raise ValueError(
f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
)
if adata.var[subset_col].dtype == "boolean":
assert adata.var[subset_col].isna().sum() == 0, (
f"The .var column `{subset_col}` contains NaN values. Can not subset data."
)
adata.var[subset_col] = adata.var[subset_col].astype("bool")
assert adata.var[subset_col].dtype == "bool", (
f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data."
)
return adata[:, adata.var[subset_col]].copy()

View File

@@ -0,0 +1,519 @@
name: "singler"
namespace: "annotate"
version: "main"
authors:
- name: "Dorien Roosen"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
description: "Input dataset (query) arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "The input (query) data to be labeled. Should be a .h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "The layer in the input data containing log normalized counts to\
\ be used for cell type annotation if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_var_gene_names"
description: "The name of the adata .var column in the input data containing gene\
\ names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_obs_clusters"
description: "The name of the adata .obs column containing cluster identities\
\ of the observations. \nIf provided, annoation is performed on the aggregated\
\ cluster profiles, \notherwise it defaults to annotation per observation.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--input_reference_gene_overlap"
description: "The minimum number of genes present in both the reference and query\
\ datasets.\n"
info: null
default:
- 100
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "The reference data to train the CellTypist classifiers on. Only\
\ required if a pre-trained --model is not provided."
info: null
example:
- "reference.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "The layer in the reference data containing lognormalized couns to\
\ be used for cell type annotation if .X is not to be used. Data are expected\
\ to be processed in the same way as the --input query dataset."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_target"
description: "The name of the adata obs column in the reference data containing\
\ cell type annotations."
info: null
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_gene_names"
description: "The name of the adata var column in the reference data containing\
\ gene names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_input"
description: ".var column containing a boolean mask corresponding to genes to\
\ be used for marker selection. By default, do not subset genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Arguments"
description: "Arguments related to the training of and classification with the SingleR\
\ model"
arguments:
- type: "integer"
name: "--de_n_genes"
description: "The number of differentially expressed genes across labels to be\
\ calculated from the reference.\nDefaults to 500 * (2/3) ^ log2(N) where N\
\ is the number of unique labels when if `--de_method` is set to `classic`,\n\
otherwise, defaults to 10.\n"
info: null
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--de_method"
description: "Method to detect differentially expressed genes between pairs of\
\ labels."
info: null
default:
- "classic"
required: false
choices:
- "classic"
- "t"
- "wilcox"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--quantile"
description: "The quantile of the correlation distribution to use to compute the\
\ score per label."
info: null
default:
- 0.8
required: false
min: 0.0
max: 1.0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--fine_tune"
description: "Whether finetuning should be performed to improve the resolution.\
\ \nIf set to True, an additional finetuning step is performed after initial\
\ classification, \nnew marker genes are calculated based on all cells with\
\ a score higher then the maximum score minus `--fine_tuning_thershold`,\nand\
\ the calculation of the scores is repeated.\n"
info: null
default:
- true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--fine_tuning_threshold"
description: "The maximum difference from the maximum correlation to use in fine-tuning\n"
info: null
default:
- 0.05
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--prune"
description: "Whether label pruning should be performed. If set to True, an additional\
\ output .obs field `--output_obs_pruned_predictions` will be added to the `--output`,\
\ containing labels where 'low-quality' labels are replaced with NA's. Labels\
\ are considered 'low-quality' when their delta score (stored in `--output_obs_delta_next`)\
\ fall more than 3 median absolute deviations below the median for that label\
\ type."
info: null
default:
- true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_predictions"
description: "In which `.obs` slot to store the predicted labels. If `--fine_tune\
\ False`, this is based only on the maximum entry in `--output_obsm_scores`.\n"
info: null
default:
- "singler_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_probability"
description: "In which `.obs` slots to store the probability of the predicted\
\ labels.\n"
info: null
default:
- "singler_probability"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_delta_next"
description: "In which `.obs` slot to store the delta between the best and next-best\
\ score. If `--fine_tune True`, this is reported for scores after fine-tuning.\n"
info: null
default:
- "singler_delta_next"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_pruned_predictions"
description: "In which `.obs` slot to store the pruned labels, where low-quality\
\ labels are replaced with NA's. Only added if `--prune True`.\n"
info: null
default:
- "singler_pruned_labels"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obsm_scores"
description: "In which `.obsm` slot to store the matrix of prediction correlations\
\ at the specified quantile for each label (column) in each cell (row).\n"
info: null
default:
- "singler_scores"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "r_script"
path: "script.R"
is_executable: true
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "SingleR performs reference-based cell type annotation for single-cell\
\ RNA-seq data \nby computing Spearman correlations between test cells and reference\
\ samples with known labels, \nusing marker genes to assign the most similar cell\
\ type label to each new cell.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "TS_Blood_filtered.h5mu"
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "lowcpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "rocker/r2u:22.04"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "docker"
env:
- "RETICULATE_PYTHON=/usr/bin/python"
- type: "apt"
packages:
- "libhdf5-dev"
- "python3"
- "python3-pip"
- "python3-dev"
- "python-is-python3"
interactive: false
- type: "r"
cran:
- "anndata"
- "reticulate"
- "SingleR"
bioc:
- "scrapper"
- "bit64"
bioc_force_install: false
warnings_as_errors: true
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/singler/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/singler"
executable: "target/executable/annotate/singler/singler"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,440 @@
name: "svm_annotation"
namespace: "annotate"
version: "main"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
argument_groups:
- name: "Inputs"
description: "Input dataset (query) arguments"
arguments:
- type: "file"
name: "--input"
description: "The input (query) data to be labeled. Should be a .h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "The layer in the input data to be used for cell type annotation\
\ if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_var_gene_names"
description: "The name of the adata var column in the input data containing gene\
\ names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--input_reference_gene_overlap"
description: "The minimum number of genes present in both the reference and query\
\ datasets.\n"
info: null
default:
- 100
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "The reference data to train the CellTypist classifiers on. Only\
\ required if a pre-trained --model is not provided."
info: null
example:
- "reference.h5mu"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "The layer in the reference data to be used for cell type annotation\
\ if .X is not to be used. Data are expected to be processed in the same way\
\ as the --input query dataset."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_target"
description: "Key in .obs attribute of reference modality with cell-type information.\n"
info: null
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_gene_names"
description: "The name of the adata var column in the reference data containing\
\ gene names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_input"
description: ".var column containing highly variable genes. By default, do not\
\ subset genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_prediction"
description: "In which `.obs` slots to store the predicted information.\n"
info: null
default:
- "svm_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_probability"
description: "In which `.obs` slots to store the probability of the predictions.\n"
info: null
default:
- "svm_probability"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Model arguments"
description: "Model arguments."
arguments:
- type: "file"
name: "--model"
description: "Pretrained model in pkl format. If not provided, the model will\
\ be trained on the reference data and --reference should be provided."
info: null
example:
- "pretrained_model.pkl"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--feature_selection"
description: "Whether to perform feature selection."
info: null
default:
- true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_iter"
description: "Maximum number of iterations for the SVM."
info: null
default:
- 5000
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--c_reg"
description: "Regularization parameter for the SVM."
info: null
default:
- 1.0
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--class_weight"
description: "\"Class weights for the SVM. The `uniform` mode gives all classes\
\ a weight of one. \nThe `balanced` mode (default) uses the values of y to\
\ automatically adjust weights inversely \nproportional to class frequencies\
\ in the input data as n_samples / (n_classes * np.bincount(y))\"\n"
info: null
default:
- "balanced"
required: false
choices:
- "balanced"
- "uniform"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "cross_check_genes.py"
- type: "file"
path: "subset_vars.py"
- type: "file"
path: "set_var_index.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\
\ of SVMs."
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "annotation_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highcpu"
- "highmem"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "libhdf5-dev"
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "scikit-learn==1.5.2"
upgrade: true
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/svm_annotation/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/svm_annotation"
executable: "target/executable/annotate/svm_annotation/svm_annotation"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,26 @@
from typing import List
def cross_check_genes(
query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100
) -> List[str]:
"""Cross check the overlap between two lists of genes
Parameters
----------
query_genes : List[str]
List of gene names
reference_genes : List[str]
List of gene names
Returns
-------
List[str]
List of overlapping genes
"""
common_ens_ids = list(set(reference_genes).intersection(set(query_genes)))
assert len(common_ens_ids) >= min_gene_overlap, (
f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}."
)
return common_ens_ids

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,24 @@
import anndata as ad
import re
def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
"""Sanitize gene names and set the index of the .var DataFrame.
Parameters
----------
adata : AnnData
Annotated data object
var_name : str | None
Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced.
Returns
-------
AnnData
Copy of `adata` with sanitized and replaced index
"""
if var_name:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]]
else:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index]
return adata

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,31 @@
def subset_vars(adata, subset_col):
"""Subset AnnData object on highly variable genes
Parameters
----------
adata : AnnData
Annotated data object
subset_col : str
Name of the boolean column in `adata.var` that contains the information if features should be used or not
Returns
-------
AnnData
Copy of `adata` with subsetted features
"""
if subset_col not in adata.var.columns:
raise ValueError(
f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
)
if adata.var[subset_col].dtype == "boolean":
assert adata.var[subset_col].isna().sum() == 0, (
f"The .var column `{subset_col}` contains NaN values. Can not subset data."
)
adata.var[subset_col] = adata.var[subset_col].astype("bool")
assert adata.var[subset_col].dtype == "bool", (
f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data."
)
return adata[:, adata.var[subset_col]].copy()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,305 @@
name: "leiden"
namespace: "cluster"
version: "main"
authors:
- name: "Dries De Maeyer"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "ddemaeyer@gmail.com"
github: "ddemaeyer"
linkedin: "dries-de-maeyer-b46a814"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsp_connectivities"
description: "In which .obsp slot the neighbor connectivities can be found."
info: null
default:
- "connectivities"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_name"
description: "Name of the .obsm key under which to add the cluster labels.\nThe\
\ name of the columns in the matrix will correspond to the resolutions.\n"
info: null
default:
- "leiden"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--resolution"
description: "A parameter value controlling the coarseness of the clustering.\
\ Higher values lead to more clusters.\nMultiple values will result in clustering\
\ being performed multiple times.\n"
info: null
default:
- 1.0
required: true
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Cluster cells using the [Leiden algorithm] [Traag18] implemented in\
\ the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain\
\ algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15]\
\ [Levine15]. \nThis requires having ran `neighbors/find_neighbors` or `neighbors/bbknn`\
\ first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in\
\ large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven\
\ Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with\
\ Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing\
\ well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale\
\ single-cell gene expression data analysis, Genome Biology. \n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highcpu"
- "midmem"
- "middisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.13-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
- "leidenalg~=0.10.0"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/cluster/leiden/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/cluster/leiden"
executable: "target/executable/cluster/leiden/leiden"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
input_path: Union[str, Path],
output_path: Union[str, Path],
compression: Union[Literal["gzip"], Literal["lzf"]],
):
input_path, output_path = str(input_path), str(output_path)
def copy_attributes(in_object, out_object):
for key, value in in_object.attrs.items():
out_object.attrs[key] = value
def visit_path(
output_h5: H5File,
compression: Union[Literal["gzip"], Literal["lzf"]],
name: str,
object: Union[Group, Dataset],
):
if isinstance(object, Group):
new_group = output_h5.create_group(name)
copy_attributes(object, new_group)
elif isinstance(object, Dataset):
# Compression only works for non-scalar Dataset objects
# Scalar objects dont have a shape defined
if not object.compression and object.shape not in [None, ()]:
new_dataset = output_h5.create_dataset(
name, data=object, compression=compression
)
copy_attributes(object, new_dataset)
else:
output_h5.copy(object, name)
else:
raise NotImplementedError(
f"Could not copy element {name}, "
f"type has not been implemented yet: {type(object)}"
)
with (
H5File(input_path, "r") as input_h5,
H5File(output_path, "w", userblock_size=512) as output_h5,
):
copy_attributes(input_h5, output_h5)
input_h5.visititems(partial(visit_path, output_h5, compression))
with open(input_path, "rb") as input_bytes:
# Mudata puts metadata like this in the first 512 bytes:
# MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
# See mudata/_core/io.py, read_h5mu() function
starting_metadata = input_bytes.read(100)
# The metadata is padded with extra null bytes up until 512 bytes
truncate_location = starting_metadata.find(b"\x00")
starting_metadata = starting_metadata[:truncate_location]
with open(output_path, "br+") as f:
nbytes = f.write(starting_metadata)
f.write(b"\0" * (512 - nbytes))
def write_h5ad_to_h5mu_with_compression(
output_file: Union[str, Path],
h5mu: Union[str, Path],
modality_name: str,
modality_data: AnnData,
output_compression=None,
):
output_file = Path(output_file)
h5mu = Path(h5mu)
output_file_uncompressed = (
output_file.with_name(output_file.stem + "_uncompressed.h5mu")
if output_compression
else output_file
)
shutil.copyfile(h5mu, output_file_uncompressed)
write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
if output_compression:
compress_h5mu(
output_file_uncompressed, output_file, compression=output_compression
)
output_file_uncompressed.unlink()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,245 @@
name: "compress_h5mu"
namespace: "compression"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Path to the input .h5mu."
info: null
example:
- "sample_path"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
description: "location of output file."
info: null
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Compress a MuData file. \n"
test_resources:
- type: "python_script"
path: "run_test.py"
is_executable: true
- type: "file"
path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "lowmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.10-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/compression/compress_h5mu/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/compression/compress_h5mu"
executable: "target/executable/compression/compress_h5mu/compress_h5mu"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
input_path: Union[str, Path],
output_path: Union[str, Path],
compression: Union[Literal["gzip"], Literal["lzf"]],
):
input_path, output_path = str(input_path), str(output_path)
def copy_attributes(in_object, out_object):
for key, value in in_object.attrs.items():
out_object.attrs[key] = value
def visit_path(
output_h5: H5File,
compression: Union[Literal["gzip"], Literal["lzf"]],
name: str,
object: Union[Group, Dataset],
):
if isinstance(object, Group):
new_group = output_h5.create_group(name)
copy_attributes(object, new_group)
elif isinstance(object, Dataset):
# Compression only works for non-scalar Dataset objects
# Scalar objects dont have a shape defined
if not object.compression and object.shape not in [None, ()]:
new_dataset = output_h5.create_dataset(
name, data=object, compression=compression
)
copy_attributes(object, new_dataset)
else:
output_h5.copy(object, name)
else:
raise NotImplementedError(
f"Could not copy element {name}, "
f"type has not been implemented yet: {type(object)}"
)
with (
H5File(input_path, "r") as input_h5,
H5File(output_path, "w", userblock_size=512) as output_h5,
):
copy_attributes(input_h5, output_h5)
input_h5.visititems(partial(visit_path, output_h5, compression))
with open(input_path, "rb") as input_bytes:
# Mudata puts metadata like this in the first 512 bytes:
# MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
# See mudata/_core/io.py, read_h5mu() function
starting_metadata = input_bytes.read(100)
# The metadata is padded with extra null bytes up until 512 bytes
truncate_location = starting_metadata.find(b"\x00")
starting_metadata = starting_metadata[:truncate_location]
with open(output_path, "br+") as f:
nbytes = f.write(starting_metadata)
f.write(b"\0" * (512 - nbytes))
def write_h5ad_to_h5mu_with_compression(
output_file: Union[str, Path],
h5mu: Union[str, Path],
modality_name: str,
modality_data: AnnData,
output_compression=None,
):
output_file = Path(output_file)
h5mu = Path(h5mu)
output_file_uncompressed = (
output_file.with_name(output_file.stem + "_uncompressed.h5mu")
if output_compression
else output_file
)
shutil.copyfile(h5mu, output_file_uncompressed)
write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
if output_compression:
compress_h5mu(
output_file_uncompressed, output_file, compression=output_compression
)
output_file_uncompressed.unlink()

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,219 @@
name: "tar_extract"
namespace: "compression"
version: "main"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input file"
info: null
example:
- "input.tar.gz"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Folder to restore file(s) to."
info: null
example:
- "output_folder"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--strip_components"
alternatives:
- "-s"
description: "Strip this amount of leading components from file names on extraction.\
\ For example, to extract only 'myfile.txt' from an archive containing the structure\
\ `this/goes/deep/myfile.txt', use 3 to strip 'this/goes/deep/'."
info: null
example:
- 1
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--exclude"
alternatives:
- "-e"
description: "Prevents any file or member whose name matches the shell wildcard\
\ (pattern) from being extracted."
info: null
example:
- "docs/figures"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "bash_script"
path: "script.sh"
is_executable: true
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Extract files from a tar archive"
test_resources:
- type: "bash_script"
path: "test.sh"
is_executable: true
- type: "file"
path: "LICENSE"
info: null
status: "deprecated"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "lowmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "ubuntu:latest"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/compression/tar_extract/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/compression/tar_extract"
executable: "target/executable/compression/tar_extract/tar_extract"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,352 @@
name: "from_10xh5_to_h5mu"
namespace: "convert"
version: "main"
authors:
- name: "Robrecht Cannoodt"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "A 10x h5 file as generated by Cell Ranger."
info: null
example:
- "raw_feature_bc_matrix.h5"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--input_metrics_summary"
description: "A metrics summary csv file as generated by Cell Ranger."
info: null
example:
- "metrics_cellranger.h5"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info:
slots:
mod:
- name: "rna"
required: true
description: "Gene expression counts."
slots:
var:
- name: "gene_symbol"
type: "string"
description: "Identification of the gene."
required: true
- name: "feature_types"
type: "string"
description: "The full name of the modality."
required: true
- name: "genome"
type: "string"
description: "Reference that was used to generate the data."
required: true
- name: "prot"
required: false
description: "Protein abundancy"
slots:
var:
- name: "gene_symbol"
type: "string"
description: "Identification of the gene."
required: true
- name: "feature_types"
type: "string"
description: "The full name of the modality."
required: true
- name: "genome"
type: "string"
description: "Reference that was used to generate the data."
required: true
- name: "vdj"
required: false
description: "VDJ transcript counts"
slots:
var:
- name: "gene_symbol"
type: "string"
description: "Identification of the gene."
required: true
- name: "feature_types"
type: "string"
description: "The full name of the modality."
required: true
- name: "genome"
type: "string"
description: "Reference that was used to generate the data."
required: true
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--uns_metrics"
description: "Name of the .uns slot under which to QC metrics (if any)."
info: null
default:
- "metrics_cellranger"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Arguments"
arguments:
- type: "integer"
name: "--min_genes"
description: "Minimum number of counts required for a cell to pass filtering."
info: null
example:
- 100
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--min_counts"
description: "Minimum number of genes expressed required for a cell to pass filtering."
info: null
example:
- 1000
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Converts a 10x h5 into an h5mu file.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_10xh5_to_h5mu/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_10xh5_to_h5mu"
executable: "target/executable/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,253 @@
name: "from_10xmtx_to_h5mu"
namespace: "convert"
version: "main"
authors:
- name: "Robrecht Cannoodt"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input mtx folder"
info: null
example:
- "input_dir_containing_gz_files"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Converts a 10x mtx into an h5mu file.\n"
test_resources:
- type: "python_script"
path: "run_test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_10xmtx_to_h5mu/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_10xmtx_to_h5mu"
executable: "target/executable/convert/from_10xmtx_to_h5mu/from_10xmtx_to_h5mu"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,228 @@
name: "from_bd_to_10x_molecular_barcode_tags"
namespace: "convert"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input SAM or BAM file."
info: null
example:
- "input.bam"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output alignment file."
info: null
example:
- "output.sam"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--bam"
description: "Output a BAM file."
info: null
direction: "input"
- type: "integer"
name: "--threads"
alternatives:
- "-t"
description: "Number of threads"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "bash_script"
path: "script.sh"
is_executable: true
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Convert the molecular barcode sequence SAM tag from BD format (MA) to\
\ 10X format (UB).\n"
test_resources:
- type: "bash_script"
path: "run_test.sh"
is_executable: true
- type: "file"
path: "output_raw"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "ubuntu:latest"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "samtools"
interactive: false
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_bd_to_10x_molecular_barcode_tags/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_bd_to_10x_molecular_barcode_tags"
executable: "target/executable/convert/from_bd_to_10x_molecular_barcode_tags/from_bd_to_10x_molecular_barcode_tags"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,266 @@
name: "from_bdrhap_to_h5mu"
namespace: "convert"
version: "main"
authors:
- name: "Dorien Roosen"
roles:
- "author"
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Robrecht Cannoodt"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Arguments"
arguments:
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Inputs"
arguments:
- type: "string"
name: "--id"
description: "A sample ID."
info: null
example:
- "my_id"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "The output h5mu of a BD Rhapsody workflow."
info: null
example:
- "sample.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Convert the output of a BD Rhapsody pipeline v2.x to a MuData h5 file.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "sample.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_bdrhap_to_h5mu/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_bdrhap_to_h5mu"
executable: "target/executable/convert/from_bdrhap_to_h5mu/from_bdrhap_to_h5mu"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,284 @@
name: "from_cellranger_multi_to_h5mu"
namespace: "convert"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input folder. Must contain the output from a cellranger multi run."
info: null
example:
- "input_dir_containing_modalities"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Locations for the output files. Must contain a wildcard (*) character,\n\
which will be replaced with the sample name.\n"
info: null
example:
- "*.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: true
multiple_sep: ";"
- type: "file"
name: "--sample_csv"
description: "CSV file describing the sample name per output file"
info: null
example:
- "samples.csv"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--uns_metrics"
description: "Name of the .uns slot under which to QC metrics (if any)."
info: null
default:
- "metrics_cellranger"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Converts the output from cellranger multi to a single .h5mu file.\n\
By default, will map the following library type names to modality names:\n - Gene\
\ Expression: rna\n - Peaks: atac\n - Antibody Capture: prot\n - VDJ: vdj\n \
\ - VDJ-T: vdj_t\n - VDJ-B: vdj_b\n - CRISPR Guide Capture: crispr\n - Multiplexing\
\ Capture: hashing\n \nOther library types have their whitepace removed and dashes\
\ replaced by\nunderscores to generate the modality name.\n\nCurrently does not\
\ allow parsing the output from cell barcode demultiplexing.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "10x_5k_anticmv"
- type: "file"
path: "10x_5k_lung_crispr"
- type: "file"
path: "10x_5k_beam"
- type: "file"
path: "10x_5k_fixed"
- type: "file"
path: "10x_4plex_dtc"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
- "scirpy~=0.12.0"
- "pandas~=2.2.3"
- "pytest"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_cellranger_multi_to_h5mu"
executable: "target/executable/convert/from_cellranger_multi_to_h5mu/from_cellranger_multi_to_h5mu"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,261 @@
name: "from_h5ad_to_h5mu"
namespace: "convert"
version: "main"
authors:
- name: "Dries De Maeyer"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "ddemaeyer@gmail.com"
github: "ddemaeyer"
linkedin: "dries-de-maeyer-b46a814"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5ad files"
info: null
default:
- "input.h5ad"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "List of names to use for the modalities. Will be used as the keys\
\ in the .mod attribute in the output MuData object\nThe number of items provided\
\ for this argument equal the number of input files (--input) and their order\
\ should match.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output MuData file."
info: null
default:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Converts a single layer h5ad file into a single MuData object\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_h5ad_to_h5mu/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_h5ad_to_h5mu"
executable: "target/executable/convert/from_h5ad_to_h5mu/from_h5ad_to_h5mu"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,242 @@
name: "from_h5ad_to_seurat"
namespace: "convert"
version: "main"
authors:
- name: "Robrecht Cannoodt"
roles:
- "author"
- "maintainer"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5ad file"
info: null
example:
- "input.h5ad"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--assay"
description: "Name of the assay to be created."
info: null
default:
- "RNA"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output Seurat file"
info: null
example:
- "output.rds"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "r_script"
path: "script.R"
is_executable: true
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Converts an h5ad file into a Seurat file.\n"
test_resources:
- type: "r_script"
path: "test.R"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "rocker/r2u:24.04"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "libhdf5-dev"
- "libgeos-dev"
interactive: false
- type: "r"
cran:
- "hdf5r"
- "Seurat"
- "SeuratObject"
github:
- "scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a"
bioc_force_install: false
warnings_as_errors: true
test_setup:
- type: "r"
cran:
- "testthat"
bioc_force_install: false
warnings_as_errors: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_h5ad_to_seurat/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_h5ad_to_seurat"
executable: "target/executable/convert/from_h5ad_to_seurat/from_h5ad_to_seurat"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,251 @@
name: "from_h5mu_or_h5ad_to_seurat"
namespace: "convert"
version: "main"
authors:
- name: "Dorien Roosen"
roles:
- "author"
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5ad or h5mu file"
info: null
example:
- "input.h5ad"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Modality to be converted if the input file is an h5mu file."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--assay"
description: "Name of the assay to be created."
info: null
default:
- "RNA"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output Seurat file"
info: null
example:
- "output.rds"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "r_script"
path: "script.R"
is_executable: true
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Converts an h5ad file or a single modality of an h5mu file into a Seurat\
\ file.\n"
test_resources:
- type: "r_script"
path: "test.R"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "rocker/r2u:22.04"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "libhdf5-dev"
- "libgeos-dev"
- "hdf5-tools"
interactive: false
- type: "r"
cran:
- "anndata"
- "hdf5r"
- "Seurat"
- "SeuratObject"
github:
- "scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a"
bioc_force_install: false
warnings_as_errors: true
test_setup:
- type: "r"
cran:
- "testthat"
bioc_force_install: false
warnings_as_errors: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_h5mu_or_h5ad_to_seurat/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_h5mu_or_h5ad_to_seurat"
executable: "target/executable/convert/from_h5mu_or_h5ad_to_seurat/from_h5mu_or_h5ad_to_seurat"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,428 @@
name: "from_h5mu_or_h5ad_to_tiledb"
namespace: "convert"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "author"
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Input"
arguments:
- type: "file"
name: "--input"
description: "Input AnnData or MuData file. When an AnnData file is provided,\
\ it is automatically assumed to \ncontain transcriptome counts.\n"
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- name: "RNA modality"
arguments:
- type: "string"
name: "--rna_modality"
description: "The name used for the RNA modality. Used when input file is a MuData\
\ object.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--rna_raw_layer_input"
description: "Location of the layer containing the raw transcriptome counts. Layers\
\ are looked for in .layers,\nexcept when using the value 'X'; in which case\
\ .X is used.\n"
info: null
example:
- "X"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--rna_normalized_layer_input"
description: "Location of the layer containing the normalized counts. Layers are\
\ looked for in .layers,\nexcept when using the value 'X'; in which case .X\
\ is used.\n"
info: null
example:
- "log_normalized"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--rna_var_gene_names_input"
description: "Column in .var that provides the gene names. If not specified, the\
\ index from the input is used.\n"
info: null
example:
- "gene_symbol"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Protein modality"
arguments:
- type: "string"
name: "--prot_modality"
description: "The name used for the protein modality. Used when input file is\
\ a MuData object.\nWhen not specified, the protein modality will not be processed.\n"
info: null
example:
- "prot"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--prot_raw_layer_input"
description: "Location of the layer containing the raw protein counts. Layers\
\ are looked for in .layers,\nexcept when using the value 'X'; in which case\
\ .X is used.\n"
info: null
example:
- "X"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--prot_normalized_layer_input"
description: "Location of the layer containing the normalized counts. Layers are\
\ looked for in .layers,\nexcept when using the value 'X'; in which case .X\
\ is used.\n"
info: null
example:
- "clr"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Output slots"
arguments:
- type: "string"
name: "--rna_modality_output"
description: "TileDB Measurement name where the RNA modality will be stored.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--prot_modality_output"
description: "Name of the TileDB Measurement where the protein modality will be\
\ stored.\n"
info: null
default:
- "prot"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_index_name_output"
description: "Name of the index that is used to describe the cells (observations).\n"
info: null
default:
- "cell_id"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--rna_var_index_name_output"
description: "Output name of the index that is used to describe the genes.\n"
info: null
default:
- "rna_index"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--rna_raw_layer_output"
description: "Output location for the raw transcriptomics counts.\n"
info: null
default:
- "X"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--rna_normalized_layer_output"
description: "Output location for the normalized RNA counts.\n"
info: null
default:
- "log_normalized"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--rna_var_gene_names_output"
description: "Name of the .var column that specifies the gene games.\n"
info: null
default:
- "gene_symbol"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--prot_var_index_name_output"
description: "Output name of the index that is used to describe the proteins.\
\ \n"
info: null
default:
- "prot_index"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--prot_raw_layer_output"
description: "Output location for the raw protein counts.\n"
info: null
default:
- "X"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--prot_normalized_layer_output"
description: "Output location for the normalized protein counts.\n"
info: null
default:
- "log_normalized"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Output arguments"
arguments:
- type: "file"
name: "--tiledb_dir"
description: "Directory where the TileDB output will be written to.\n"
info: null
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Convert a MuData or AnnData object to tiledb. Currently, transcriptome\
\ and protein modalities are supported.\n\nNOTE: The functionality provided by this\
\ component is experimental and may be subject to change. \n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midmem"
- "midcpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "tiledbsoma"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_h5mu_or_h5ad_to_tiledb"
executable: "target/executable/convert/from_h5mu_or_h5ad_to_tiledb/from_h5mu_or_h5ad_to_tiledb"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,262 @@
name: "from_h5mu_to_h5ad"
namespace: "convert"
version: "main"
authors:
- name: "Robrecht Cannoodt"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input MuData file"
info: null
default:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output AnnData file."
info: null
default:
- "output.h5ad"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Converts a h5mu file into a h5ad file.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_h5mu_to_h5ad/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_h5mu_to_h5ad"
executable: "target/executable/convert/from_h5mu_to_h5ad/from_h5mu_to_h5ad"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,237 @@
name: "from_h5mu_to_seurat"
namespace: "convert"
version: "main"
authors:
- name: "Robrecht Cannoodt"
roles:
- "author"
- "maintainer"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file"
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output Seurat file"
info: null
example:
- "output.rds"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "r_script"
path: "script.R"
is_executable: true
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Converts an h5mu file into a Seurat file.\n\nRestrictions:\n - Only\
\ the intersection of cells is currently loaded into the Seurat object due to the\
\ object structure limitation.\n - Multimodal embeddings (global .obsm slot) are\
\ loaded with the assay.used field set to the default assay.\n - Embeddings names\
\ are changed in order to comply with R & Seurat requirements and conventions.\n\
\ - Feature names with underscores ('_') are automatically replaced with dashes\
\ ('-')\n - Seurat does not support global variables metadata /var.\n"
test_resources:
- type: "r_script"
path: "run_test.R"
is_executable: true
- type: "file"
path: "10x_5k_anticmv"
info: null
status: "deprecated"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "singlecpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "rocker/r2u:24.04"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "libhdf5-dev"
- "libgeos-dev"
interactive: false
- type: "r"
cran:
- "anndata"
- "hdf5r"
- "testthat"
- "SeuratObject"
- "Seurat"
bioc_force_install: false
warnings_as_errors: true
- type: "r"
github:
- "pmbio/MuDataSeurat@empty-tables-and-nullable"
bioc_force_install: false
warnings_as_errors: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/convert/from_h5mu_to_seurat/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/from_h5mu_to_seurat"
executable: "target/executable/convert/from_h5mu_to_seurat/from_h5mu_to_seurat"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,322 @@
name: "velocyto_to_h5mu"
namespace: "convert"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
- "author"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Robrecht Cannoodt"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
- name: "Angela Oliveira Pisco"
roles:
- "contributor"
info:
role: "Contributor"
links:
github: "aopisco"
orcid: "0000-0003-0142-2355"
linkedin: "aopisco"
organizations:
- name: "Insitro"
href: "https://insitro.com"
role: "Director of Computational Biology"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input_loom"
description: "Path to the input loom file."
info: null
example:
- "input.loom"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--input_h5mu"
description: "If a MuData file is provided,"
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "The name of the modality to operate on."
info: null
default:
- "rna_velocity"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
description: "Path to the output MuData file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer_spliced"
description: "Output layer for the spliced reads."
info: null
default:
- "velo_spliced"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer_unspliced"
description: "Output layer for the unspliced reads."
info: null
default:
- "velo_unspliced"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer_ambiguous"
description: "Output layer for the ambiguous reads."
info: null
default:
- "velo_ambiguous"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Convert a velocyto loom file to a h5mu file.\n\nIf an input h5mu file\
\ is also provided, the velocity\nh5ad object will get added to that h5mu instead.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "cellranger_tiny_fastq"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "lowcpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
- "loompy"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/velocity/velocyto_to_h5mu/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/convert/velocyto_to_h5mu"
executable: "target/executable/convert/velocyto_to_h5mu/velocyto_to_h5mu"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,660 @@
name: "cellbender_remove_background"
namespace: "correction"
version: "main"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file. Data file on which to run tool. Data must be un-filtered:\
\ it should include empty droplets."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "List of modalities to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Full count matrix as an h5mu file, with background RNA removed.\
\ This file contains all the original droplet barcodes."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer_output"
description: "Output layer"
info: null
default:
- "cellbender_corrected"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_background_fraction"
info: null
default:
- "cellbender_background_fraction"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_cell_probability"
info: null
default:
- "cellbender_cell_probability"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_cell_size"
info: null
default:
- "cellbender_cell_size"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_droplet_efficiency"
description: "Name of the column in the .obs dataframe to store the droplet efficiencies\
\ in.\n"
info: null
default:
- "cellbender_droplet_efficiency"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_latent_scale"
info: null
default:
- "cellbender_latent_scale"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_ambient_expression"
info: null
default:
- "cellbender_ambient_expression"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_gene_expression_encoding"
info: null
default:
- "cellbender_gene_expression_encoding"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Arguments"
arguments:
- type: "boolean"
name: "--expected_cells_from_qc"
description: "Will use the Cell Ranger QC to determine the estimated number of\
\ cells"
info: null
default:
- false
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--expected_cells"
description: "Number of cells expected in the dataset (a rough estimate within\
\ a factor of 2 is sufficient)."
info: null
example:
- 1000
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--total_droplets_included"
description: "The number of droplets from the rank-ordered UMI plot\nthat will\
\ have their cell probabilities inferred as an\noutput. Include the droplets\
\ which might contain cells.\nDroplets beyond TOTAL_DROPLETS_INCLUDED should\
\ be\n'surely empty' droplets.\n"
info: null
example:
- 25000
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--force_cell_umi_prior"
description: "Ignore CellBender's heuristic prior estimation, and use this prior\
\ for UMI counts in cells."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--force_empty_umi_prior"
description: "Ignore CellBender's heuristic prior estimation, and use this prior\
\ for UMI counts in empty droplets."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--model"
description: "Which model is being used for count data.\n\n* 'naive' subtracts\
\ the estimated ambient profile.\n* 'simple' does not model either ambient RNA\
\ or random barcode swapping (for debugging purposes -- not recommended).\n\
* 'ambient' assumes background RNA is incorporated into droplets.\n* 'swapping'\
\ assumes background RNA comes from random barcode swapping (via PCR chimeras).\n\
* 'full' uses a combined ambient and swapping model.\n"
info: null
default:
- "full"
required: false
choices:
- "naive"
- "simple"
- "ambient"
- "swapping"
- "full"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--epochs"
description: "Number of epochs to train."
info: null
default:
- 150
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--low_count_threshold"
description: "Droplets with UMI counts below this number are completely \nexcluded\
\ from the analysis. This can help identify the correct \nprior for empty droplet\
\ counts in the rare case where empty \ncounts are extremely high (over 200).\n"
info: null
default:
- 5
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--z_dim"
description: "Dimension of latent variable z.\n"
info: null
default:
- 64
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--z_layers"
description: "Dimension of hidden layers in the encoder for z.\n"
info: null
default:
- 512
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "double"
name: "--training_fraction"
description: "Training detail: the fraction of the data used for training.\nThe\
\ rest is never seen by the inference algorithm. Speeds up learning.\n"
info: null
default:
- 0.9
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--empty_drop_training_fraction"
description: "Training detail: the fraction of the training data each epoch that\
\ \nis drawn (randomly sampled) from surely empty droplets.\n"
info: null
default:
- 0.2
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--ignore_features"
description: "Integer indices of features to ignore entirely. In the output\n\
count matrix, the counts for these features will be unchanged.\n"
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "double"
name: "--fpr"
description: "Target 'delta' false positive rate in [0, 1). Use 0 for a cohort\n\
of samples which will be jointly analyzed for differential expression.\nA false\
\ positive is a true signal count that is erroneously removed.\nMore background\
\ removal is accompanied by more signal removal at\nhigh values of FPR. You\
\ can specify multiple values, which will\ncreate multiple output files.\n"
info: null
default:
- 0.01
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--exclude_feature_types"
description: "Feature types to ignore during the analysis. These features will\n\
be left unchanged in the output file.\n"
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "double"
name: "--projected_ambient_count_threshold"
description: "Controls how many features are included in the analysis, which\n\
can lead to a large speedup. If a feature is expected to have less\nthan PROJECTED_AMBIENT_COUNT_THRESHOLD\
\ counts total in all cells\n(summed), then that gene is excluded, and it will\
\ be unchanged\nin the output count matrix. For example, \nPROJECTED_AMBIENT_COUNT_THRESHOLD\
\ = 0 will include all features\nwhich have even a single count in any empty\
\ droplet.\n"
info: null
default:
- 0.1
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--learning_rate"
description: "Training detail: lower learning rate for inference.\nA OneCycle\
\ learning rate schedule is used, where the\nupper learning rate is ten times\
\ this value. (For this\nvalue, probably do not exceed 1e-3).\n"
info: null
default:
- 1.0E-4
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--final_elbo_fail_fraction"
description: "Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO\
\ - initial_test_ELBO) > FINAL_ELBO_FAIL_FRACTION.\nTraining will automatically\
\ re-run if --num-training-tries > 1.\nBy default, will not fail training based\
\ on final_training_ELBO.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--epoch_elbo_fail_fraction"
description: "Training is considered to have failed if \n(previous_epoch_test_ELBO\
\ - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO)\
\ > EPOCH_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries\
\ > 1.\nBy default, will not fail training based on epoch_training_ELBO.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--num_training_tries"
description: "Number of times to attempt to train the model. At each subsequent\
\ attempt,\nthe learning rate is multiplied by LEARNING_RATE_RETRY_MULT.\n"
info: null
default:
- 1
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--learning_rate_retry_mult"
description: "Learning rate is multiplied by this amount each time a new training\n\
attempt is made. (This parameter is only used if training fails based\non EPOCH_ELBO_FAIL_FRACTION\
\ or FINAL_ELBO_FAIL_FRACTION and\nNUM_TRAINING_TRIES is > 1.) \n"
info: null
default:
- 0.2
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--posterior_batch_size"
description: "Training detail: size of batches when creating the posterior.\n\
Reduce this to avoid running out of GPU memory creating the posterior\n(will\
\ be slower).\n"
info: null
default:
- 128
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--posterior_regulation"
description: "Posterior regularization method. (For experts: not required for\
\ normal usage,\nsee documentation). \n\n* PRq is approximate quantile-targeting.\n\
* PRmu is approximate mean-targeting aggregated over genes (behavior of v0.2.0).\n\
* PRmu_gene is approximate mean-targeting per gene.\n"
info: null
required: false
choices:
- "PRq"
- "PRmu"
- "PRmu_gene"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--alpha"
description: "Tunable parameter alpha for the PRq posterior regularization method\n\
(not normally used: see documentation).\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--q"
description: "Tunable parameter q for the CDF threshold estimation method (not\n\
normally used: see documentation).\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--estimator"
description: "Output denoised count estimation method. (For experts: not required\n\
for normal usage, see documentation).\n"
info: null
default:
- "mckp"
required: false
choices:
- "map"
- "mean"
- "cdf"
- "sample"
- "mckp"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--estimator_multiple_cpu"
description: "Including the flag --estimator-multiple-cpu will use more than one\n\
CPU to compute the MCKP output count estimator in parallel (does nothing\nfor\
\ other estimators).\n"
info: null
direction: "input"
- type: "boolean"
name: "--constant_learning_rate"
description: "Including the flag --constant-learning-rate will use the ClippedAdam\n\
optimizer instead of the OneCycleLR learning rate schedule, which is\nthe default.\
\ Learning is faster with the OneCycleLR schedule.\nHowever, training can easily\
\ be continued from a checkpoint for more\nepochs than the initial command specified\
\ when using ClippedAdam. On\nthe other hand, if using the OneCycleLR schedule\
\ with 150 epochs\nspecified, it is not possible to pick up from that final\
\ checkpoint\nand continue training until 250 epochs.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--debug"
description: "Including the flag --debug will log extra messages useful for debugging.\n"
info: null
direction: "input"
- type: "boolean_true"
name: "--cuda"
description: "Including the flag --cuda will run the inference on a\nGPU.\n"
info: null
direction: "input"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Eliminating technical artifacts from high-throughput single-cell RNA\
\ sequencing data.\n\nThis module removes counts due to ambient RNA molecules and\
\ random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the\
\ moment, only the count matrices produced by the CellRanger count pipeline is supported.\
\ Support for additional tools and protocols \nwill be added in the future. A quick\
\ start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midcpu"
- "midmem"
- "gpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "docker"
run:
- "apt update && DEBIAN_FRONTEND=noninteractive apt install -y make build-essential\
\ libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates\
\ curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev\
\ liblzma-dev mecab-ipadic-utf8 git \\\n&& curl https://pyenv.run | bash \\\n\
&& pyenv update \\\n&& pyenv install $PYTHON_VERSION \\\n&& pyenv global $PYTHON_VERSION\
\ \\\n&& apt-get clean\n"
env:
- "PYENV_ROOT=\"/root/.pyenv\""
- "PATH=\"$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH\""
- "PYTHON_VERSION=3.7.16"
- type: "python"
user: false
packages:
- "lxml~=4.8.0"
- "mudata~=0.2.1"
- "cellbender~=0.3.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/correction/cellbender_remove_background/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/correction/cellbender_remove_background"
executable: "target/executable/correction/cellbender_remove_background/cellbender_remove_background"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

Some files were not shown because too many files have changed in this diff Show More