Build branch main with version main (173327cc)

Build pipeline: vsh-ci-build-template-k4qzr

Source commit: 173327cc56

Source message: Cellranger multi conversion: fix combined AB + CB probe experiments (#1062)
This commit is contained in:
CI
2025-08-22 08:50:18 +00:00
commit cd5554d22f
2226 changed files with 1154442 additions and 0 deletions

View File

@@ -0,0 +1,322 @@
name: "harmonypy"
namespace: "integrate"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Robrecht Cannoodt"
roles:
- "contributor"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file"
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_input"
description: "Which .obsm slot to use as a starting PCA embedding."
info: null
default:
- "X_pca"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_output"
description: "In which .obsm slot to store the resulting integrated embedding."
info: null
default:
- "X_pca_integrated"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--theta"
description: "Diversity clustering penalty parameter. Can be set as a single value\
\ for all batch observations or as multiple values, one for each observation\
\ in the batches defined by --obs_covariates. theta=0 does not encourage any\
\ diversity. Larger values of theta result in more diverse clusters."
info: null
default:
- 2.0
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--obs_covariates"
description: "The .obs field(s) that define the covariate(s) to regress out."
info: null
example:
- "batch"
- "sample"
required: true
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Performs Harmony integration based as described in https://github.com/immunogenomics/harmony.\
\ Based on an implementation in python from https://github.com/slowkow/harmonypy"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_mms.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highmem"
- "highcpu"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
- "harmonypy~=0.0.6"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/integrate/harmonypy/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/integrate/harmonypy"
executable: "target/executable/integrate/harmonypy/harmonypy"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
input_path: Union[str, Path],
output_path: Union[str, Path],
compression: Union[Literal["gzip"], Literal["lzf"]],
):
input_path, output_path = str(input_path), str(output_path)
def copy_attributes(in_object, out_object):
for key, value in in_object.attrs.items():
out_object.attrs[key] = value
def visit_path(
output_h5: H5File,
compression: Union[Literal["gzip"], Literal["lzf"]],
name: str,
object: Union[Group, Dataset],
):
if isinstance(object, Group):
new_group = output_h5.create_group(name)
copy_attributes(object, new_group)
elif isinstance(object, Dataset):
# Compression only works for non-scalar Dataset objects
# Scalar objects dont have a shape defined
if not object.compression and object.shape not in [None, ()]:
new_dataset = output_h5.create_dataset(
name, data=object, compression=compression
)
copy_attributes(object, new_dataset)
else:
output_h5.copy(object, name)
else:
raise NotImplementedError(
f"Could not copy element {name}, "
f"type has not been implemented yet: {type(object)}"
)
with (
H5File(input_path, "r") as input_h5,
H5File(output_path, "w", userblock_size=512) as output_h5,
):
copy_attributes(input_h5, output_h5)
input_h5.visititems(partial(visit_path, output_h5, compression))
with open(input_path, "rb") as input_bytes:
# Mudata puts metadata like this in the first 512 bytes:
# MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
# See mudata/_core/io.py, read_h5mu() function
starting_metadata = input_bytes.read(100)
# The metadata is padded with extra null bytes up until 512 bytes
truncate_location = starting_metadata.find(b"\x00")
starting_metadata = starting_metadata[:truncate_location]
with open(output_path, "br+") as f:
nbytes = f.write(starting_metadata)
f.write(b"\0" * (512 - nbytes))
def write_h5ad_to_h5mu_with_compression(
output_file: Union[str, Path],
h5mu: Union[str, Path],
modality_name: str,
modality_data: AnnData,
output_compression=None,
):
output_file = Path(output_file)
h5mu = Path(h5mu)
output_file_uncompressed = (
output_file.with_name(output_file.stem + "_uncompressed.h5mu")
if output_compression
else output_file
)
shutil.copyfile(h5mu, output_file_uncompressed)
write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
if output_compression:
compress_h5mu(
output_file_uncompressed, output_file, compression=output_compression
)
output_file_uncompressed.unlink()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,360 @@
name: "scanorama"
namespace: "integrate"
version: "main"
authors:
- name: "Dries De Maeyer"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "ddemaeyer@gmail.com"
github: "ddemaeyer"
linkedin: "dries-de-maeyer-b46a814"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file"
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output .h5mu file"
info: null
default:
- "output.h5ad"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_batch"
description: "Column name discriminating between your batches."
info: null
default:
- "batch"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_input"
description: "Basis obsm slot to run scanorama on."
info: null
default:
- "X_pca"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_output"
description: "The name of the field in adata.obsm where the integrated embeddings\
\ will be stored after running this function. Defaults to X_scanorama."
info: null
default:
- "X_scanorama"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--knn"
description: "Number of nearest neighbors to use for matching."
info: null
default:
- 20
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--batch_size"
description: "The batch size used in the alignment vector computation. Useful\
\ when integrating very large (>100k samples) datasets. Set to large value that\
\ runs within available memory."
info: null
default:
- 5000
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--sigma"
description: "Correction smoothing parameter on Gaussian kernel."
info: null
default:
- 15.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--approx"
description: "Use approximate nearest neighbors with Python annoy; greatly speeds\
\ up matching runtime."
info: null
default:
- true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--alpha"
description: "Alignment score minimum cutoff"
info: null
default:
- 0.1
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Use Scanorama to integrate different experiments.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_mms.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midcpu"
- "highmem"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
- "build-essential"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
- "scanorama"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/integrate/scanorama/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/integrate/scanorama"
executable: "target/executable/integrate/scanorama/scanorama"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
input_path: Union[str, Path],
output_path: Union[str, Path],
compression: Union[Literal["gzip"], Literal["lzf"]],
):
input_path, output_path = str(input_path), str(output_path)
def copy_attributes(in_object, out_object):
for key, value in in_object.attrs.items():
out_object.attrs[key] = value
def visit_path(
output_h5: H5File,
compression: Union[Literal["gzip"], Literal["lzf"]],
name: str,
object: Union[Group, Dataset],
):
if isinstance(object, Group):
new_group = output_h5.create_group(name)
copy_attributes(object, new_group)
elif isinstance(object, Dataset):
# Compression only works for non-scalar Dataset objects
# Scalar objects dont have a shape defined
if not object.compression and object.shape not in [None, ()]:
new_dataset = output_h5.create_dataset(
name, data=object, compression=compression
)
copy_attributes(object, new_dataset)
else:
output_h5.copy(object, name)
else:
raise NotImplementedError(
f"Could not copy element {name}, "
f"type has not been implemented yet: {type(object)}"
)
with (
H5File(input_path, "r") as input_h5,
H5File(output_path, "w", userblock_size=512) as output_h5,
):
copy_attributes(input_h5, output_h5)
input_h5.visititems(partial(visit_path, output_h5, compression))
with open(input_path, "rb") as input_bytes:
# Mudata puts metadata like this in the first 512 bytes:
# MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
# See mudata/_core/io.py, read_h5mu() function
starting_metadata = input_bytes.read(100)
# The metadata is padded with extra null bytes up until 512 bytes
truncate_location = starting_metadata.find(b"\x00")
starting_metadata = starting_metadata[:truncate_location]
with open(output_path, "br+") as f:
nbytes = f.write(starting_metadata)
f.write(b"\0" * (512 - nbytes))
def write_h5ad_to_h5mu_with_compression(
output_file: Union[str, Path],
h5mu: Union[str, Path],
modality_name: str,
modality_data: AnnData,
output_compression=None,
):
output_file = Path(output_file)
h5mu = Path(h5mu)
output_file_uncompressed = (
output_file.with_name(output_file.stem + "_uncompressed.h5mu")
if output_compression
else output_file
)
shutil.copyfile(h5mu, output_file_uncompressed)
write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
if output_compression:
compress_h5mu(
output_file_uncompressed, output_file, compression=output_compression
)
output_file_uncompressed.unlink()

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,482 @@
name: "scarches"
namespace: "integrate"
version: "main"
authors:
- name: "Vladimir Shitov"
roles:
- "author"
info:
role: "Contributor"
links:
email: "vladimir.shitov@helmholtz-muenchen.de"
github: "vladimirshitov"
orcid: "0000-0002-1960-8812"
linkedin: "vladimir-shitov-9a659513b"
organizations:
- name: "Helmholtz Munich"
href: "https://www.helmholtz-munich.de"
role: "PhD Candidate"
- name: "Dorien Roosen"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
description: "Arguments related to the input (query) dataset"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file to use as a query"
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer"
description: "Layer to be used for scArches, if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_obs_batch"
description: "Name of the .obs column with batch information."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_obs_label"
description: "Name of the .obs column with celltype information."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_var_gene_names"
description: "Name of the .var column with gene names, if the var .index is not\
\ to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_obs_size_factor"
description: "Key in adata.obs for size factor information. Instead of using library\
\ size as a size factor,\nthe provided size factor column will be used as offset\
\ in the mean of the likelihood.\nAssumed to be on linear scale.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
arguments:
- type: "file"
name: "--reference"
alternatives:
- "-r"
description: "Path to the directory with reference model or a web link."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_class"
description: "For legacy models; the type of model (where the type of model was\
\ not saved with it; e.g. when they were generated with scvi-tools versions\
\ < 0.15)."
info: null
example:
- "SCANVI"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--model_output"
description: "Output directory for model"
info: null
default:
- "model"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_output"
description: "In which .obsm slot to store the resulting integrated embedding."
info: null
default:
- "X_integrated_scanvi"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_output_predictions"
description: "In which .obs slot to store the resulting label predictions. Only\
\ relevant if a scANVI model was provided."
info: null
default:
- "scanvi_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_output_probabilities"
description: "In which .obs slot to store the probabilities of the label predictions.\
\ Only relevant if a scANVI model was provided."
info: null
default:
- "scanvi_proba"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Early stopping arguments"
arguments:
- type: "boolean"
name: "--early_stopping"
description: "Whether to perform early stopping with respect to the validation\
\ set."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--early_stopping_monitor"
description: "Metric logged during validation set epoch."
info: null
default:
- "elbo_validation"
required: false
choices:
- "elbo_validation"
- "reconstruction_loss_validation"
- "kl_local_validation"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--early_stopping_patience"
description: "Number of validation epochs with no improvement after which training\
\ will be stopped."
info: null
default:
- 45
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--early_stopping_min_delta"
description: "Minimum change in the monitored quantity to qualify as an improvement,\
\ i.e. an absolute change of less than min_delta, will count as no improvement."
info: null
default:
- 0.0
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Learning parameters"
arguments:
- type: "integer"
name: "--max_epochs"
description: "Number of passes through the dataset, defaults to (20000 / number\
\ of cells) * 400 or 400; whichever is smallest."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--reduce_lr_on_plateau"
description: "Whether to monitor validation loss and reduce learning rate when\
\ validation set `lr_scheduler_metric` plateaus."
info: null
default:
- true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--lr_factor"
description: "Factor to reduce learning rate."
info: null
default:
- 0.6
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--lr_patience"
description: "Number of epochs with no improvement after which learning rate will\
\ be reduced."
info: null
default:
- 30.0
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "set_var_index.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Performs reference mapping with scArches"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "scanvi_model"
- type: "file"
path: "scvi_model"
- type: "file"
path: "pbmc_1k_protein_v3_mms.h5mu"
- type: "file"
path: "HLCA_reference_model.zip"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highmem"
- "highcpu"
- "highdisk"
- "gpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "nvcr.io/nvidia/pytorch:25.05-py3"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "jax[cuda]"
- "scvi-tools~=1.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/integrate/scarches/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/integrate/scarches"
executable: "target/executable/integrate/scarches/scarches"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
input_path: Union[str, Path],
output_path: Union[str, Path],
compression: Union[Literal["gzip"], Literal["lzf"]],
):
input_path, output_path = str(input_path), str(output_path)
def copy_attributes(in_object, out_object):
for key, value in in_object.attrs.items():
out_object.attrs[key] = value
def visit_path(
output_h5: H5File,
compression: Union[Literal["gzip"], Literal["lzf"]],
name: str,
object: Union[Group, Dataset],
):
if isinstance(object, Group):
new_group = output_h5.create_group(name)
copy_attributes(object, new_group)
elif isinstance(object, Dataset):
# Compression only works for non-scalar Dataset objects
# Scalar objects dont have a shape defined
if not object.compression and object.shape not in [None, ()]:
new_dataset = output_h5.create_dataset(
name, data=object, compression=compression
)
copy_attributes(object, new_dataset)
else:
output_h5.copy(object, name)
else:
raise NotImplementedError(
f"Could not copy element {name}, "
f"type has not been implemented yet: {type(object)}"
)
with (
H5File(input_path, "r") as input_h5,
H5File(output_path, "w", userblock_size=512) as output_h5,
):
copy_attributes(input_h5, output_h5)
input_h5.visititems(partial(visit_path, output_h5, compression))
with open(input_path, "rb") as input_bytes:
# Mudata puts metadata like this in the first 512 bytes:
# MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
# See mudata/_core/io.py, read_h5mu() function
starting_metadata = input_bytes.read(100)
# The metadata is padded with extra null bytes up until 512 bytes
truncate_location = starting_metadata.find(b"\x00")
starting_metadata = starting_metadata[:truncate_location]
with open(output_path, "br+") as f:
nbytes = f.write(starting_metadata)
f.write(b"\0" * (512 - nbytes))
def write_h5ad_to_h5mu_with_compression(
output_file: Union[str, Path],
h5mu: Union[str, Path],
modality_name: str,
modality_data: AnnData,
output_compression=None,
):
output_file = Path(output_file)
h5mu = Path(h5mu)
output_file_uncompressed = (
output_file.with_name(output_file.stem + "_uncompressed.h5mu")
if output_compression
else output_file
)
shutil.copyfile(h5mu, output_file_uncompressed)
write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
if output_compression:
compress_h5mu(
output_file_uncompressed, output_file, compression=output_compression
)
output_file_uncompressed.unlink()

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,24 @@
import anndata as ad
import re
def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
"""Sanitize gene names and set the index of the .var DataFrame.
Parameters
----------
adata : AnnData
Annotated data object
var_name : str | None
Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced.
Returns
-------
AnnData
Copy of `adata` with sanitized and replaced index
"""
if var_name:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]]
else:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index]
return adata

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

View File

@@ -0,0 +1,638 @@
name: "scvi"
namespace: "integrate"
version: "main"
authors:
- name: "Malte D. Luecken"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "malte.luecken@helmholtz-muenchen.de"
github: "LuckyMD"
orcid: "0000-0001-7464-7921"
linkedin: "malte-l%C3%BCcken-b8b21049"
twitter: "MDLuecken"
organizations:
- name: "Helmholtz Munich"
href: "https://www.helmholtz-munich.de"
role: "Group Leader"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Matthias Beyens"
roles:
- "contributor"
info:
role: "Contributor"
links:
github: "MatthiasBeyens"
orcid: "0000-0003-3304-0706"
email: "matthias.beyens@gmail.com"
linkedin: "mbeyens"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file"
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "Input layer to use. If None, X is used"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_batch"
description: "Column name discriminating between your batches."
info: null
default:
- "sample_id"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_gene_names"
description: ".var column containing gene names. By default, use the index."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_input"
description: ".var column containing highly variable genes. By default, do not\
\ subset genes."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_labels"
description: "Key in adata.obs for label information. Categories will automatically\
\ be \nconverted into integer categories and saved to adata.obs['_scvi_labels'].\n\
If None, assigns the same label to all the data.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_size_factor"
description: "Key in adata.obs for size factor information. Instead of using library\
\ size as a size factor,\nthe provided size factor column will be used as offset\
\ in the mean of the likelihood.\nAssumed to be on linear scale.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_categorical_covariate"
description: "Keys in adata.obs that correspond to categorical data. These covariates\
\ can be added in\naddition to the batch covariate and are also treated as nuisance\
\ factors\n(i.e., the model tries to minimize their effects on the latent space).\n\
Thus, these should not be used for biologically-relevant factors that you do\
\ _not_ want to correct for.\n"
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--obs_continuous_covariate"
description: "Keys in adata.obs that correspond to continuous data. These covariates\
\ can be added in\naddition to the batch covariate and are also treated as nuisance\
\ factors\n(i.e., the model tries to minimize their effects on the latent space).\
\ Thus, these should not be\nused for biologically-relevant factors that you\
\ do _not_ want to correct for.\n"
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output_model"
description: "Folder where the state of the trained model will be saved to."
info: null
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_output"
description: "In which .obsm slot to store the resulting integrated embedding."
info: null
default:
- "X_scvi_integrated"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "SCVI options"
arguments:
- type: "integer"
name: "--n_hidden_nodes"
description: "Number of nodes per hidden layer."
info: null
default:
- 128
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--n_dimensions_latent_space"
description: "Dimensionality of the latent space."
info: null
default:
- 30
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--n_hidden_layers"
description: "Number of hidden layers used for encoder and decoder neural-networks."
info: null
default:
- 2
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--dropout_rate"
description: "Dropout rate for the neural networks."
info: null
default:
- 0.1
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--dispersion"
description: "Set the behavior for the dispersion for negative binomial distributions:\n\
- gene: dispersion parameter of negative binomial is constant per gene across\
\ cells\n- gene-batch: dispersion can differ between different batches\n- gene-label:\
\ dispersion can differ between different labels\n- gene-cell: dispersion can\
\ differ for every gene in every cell\n"
info: null
default:
- "gene"
required: false
choices:
- "gene"
- "gene-batch"
- "gene-label"
- "gene-cell"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--gene_likelihood"
description: "Model used to generate the expression data from a count-based likelihood\
\ distribution.\n- nb: Negative binomial distribution\n- zinb: Zero-inflated\
\ negative binomial distribution\n- poisson: Poisson distribution\n"
info: null
default:
- "nb"
required: false
choices:
- "nb"
- "zinb"
- "poisson"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Variational auto-encoder model options"
arguments:
- type: "string"
name: "--use_layer_normalization"
description: "Neural networks for which to enable layer normalization. \n"
info: null
default:
- "both"
required: false
choices:
- "encoder"
- "decoder"
- "none"
- "both"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--use_batch_normalization"
description: "Neural networks for which to enable batch normalization. \n"
info: null
default:
- "none"
required: false
choices:
- "encoder"
- "decoder"
- "none"
- "both"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_false"
name: "--encode_covariates"
description: "Whether to concatenate covariates to expression in encoder"
info: null
direction: "input"
- type: "boolean_true"
name: "--deeply_inject_covariates"
description: "Whether to concatenate covariates into output of hidden layers in\
\ encoder/decoder. \nThis option only applies when n_layers > 1. The covariates\
\ are concatenated to\nthe input of subsequent hidden layers.\n"
info: null
direction: "input"
- type: "boolean_true"
name: "--use_observed_lib_size"
description: "Use observed library size for RNA as scaling factor in mean of conditional\
\ distribution.\n"
info: null
direction: "input"
- name: "Early stopping arguments"
arguments:
- type: "boolean"
name: "--early_stopping"
description: "Whether to perform early stopping with respect to the validation\
\ set."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--early_stopping_monitor"
description: "Metric logged during validation set epoch."
info: null
default:
- "elbo_validation"
required: false
choices:
- "elbo_validation"
- "reconstruction_loss_validation"
- "kl_local_validation"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--early_stopping_patience"
description: "Number of validation epochs with no improvement after which training\
\ will be stopped."
info: null
default:
- 45
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--early_stopping_min_delta"
description: "Minimum change in the monitored quantity to qualify as an improvement,\
\ i.e. an absolute change of less than min_delta, will count as no improvement."
info: null
default:
- 0.0
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Learning parameters"
arguments:
- type: "integer"
name: "--max_epochs"
description: "Number of passes through the dataset, defaults to (20000 / number\
\ of cells) * 400 or 400; whichever is smallest."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--reduce_lr_on_plateau"
description: "Whether to monitor validation loss and reduce learning rate when\
\ validation set `lr_scheduler_metric` plateaus."
info: null
default:
- true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--lr_factor"
description: "Factor to reduce learning rate."
info: null
default:
- 0.6
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--lr_patience"
description: "Number of epochs with no improvement after which learning rate will\
\ be reduced."
info: null
default:
- 30.0
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Data validition"
arguments:
- type: "integer"
name: "--n_obs_min_count"
description: "Minimum number of cells threshold ensuring that every obs_batch\
\ category has sufficient observations (cells) for model training."
info: null
default:
- 0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--n_var_min_count"
description: "Minimum number of genes threshold ensuring that every var_input\
\ filter has sufficient observations (genes) for model training."
info: null
default:
- 0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "subset_vars.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "set_var_index.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Performs scvi integration as done in the human lung cell atlas https://github.com/LungCellAtlas/HLCA"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_mms.h5mu"
- type: "file"
path: "TS_Blood_filtered.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midcpu"
- "midmem"
- "gpu"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "nvcr.io/nvidia/pytorch:25.05-py3"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
- type: "python"
user: false
packages:
- "jax[cuda]"
- "scvi-tools~=1.3.1"
upgrade: true
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/integrate/scvi/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/integrate/scvi"
executable: "target/executable/integrate/scvi/scvi"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
input_path: Union[str, Path],
output_path: Union[str, Path],
compression: Union[Literal["gzip"], Literal["lzf"]],
):
input_path, output_path = str(input_path), str(output_path)
def copy_attributes(in_object, out_object):
for key, value in in_object.attrs.items():
out_object.attrs[key] = value
def visit_path(
output_h5: H5File,
compression: Union[Literal["gzip"], Literal["lzf"]],
name: str,
object: Union[Group, Dataset],
):
if isinstance(object, Group):
new_group = output_h5.create_group(name)
copy_attributes(object, new_group)
elif isinstance(object, Dataset):
# Compression only works for non-scalar Dataset objects
# Scalar objects dont have a shape defined
if not object.compression and object.shape not in [None, ()]:
new_dataset = output_h5.create_dataset(
name, data=object, compression=compression
)
copy_attributes(object, new_dataset)
else:
output_h5.copy(object, name)
else:
raise NotImplementedError(
f"Could not copy element {name}, "
f"type has not been implemented yet: {type(object)}"
)
with (
H5File(input_path, "r") as input_h5,
H5File(output_path, "w", userblock_size=512) as output_h5,
):
copy_attributes(input_h5, output_h5)
input_h5.visititems(partial(visit_path, output_h5, compression))
with open(input_path, "rb") as input_bytes:
# Mudata puts metadata like this in the first 512 bytes:
# MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
# See mudata/_core/io.py, read_h5mu() function
starting_metadata = input_bytes.read(100)
# The metadata is padded with extra null bytes up until 512 bytes
truncate_location = starting_metadata.find(b"\x00")
starting_metadata = starting_metadata[:truncate_location]
with open(output_path, "br+") as f:
nbytes = f.write(starting_metadata)
f.write(b"\0" * (512 - nbytes))
def write_h5ad_to_h5mu_with_compression(
output_file: Union[str, Path],
h5mu: Union[str, Path],
modality_name: str,
modality_data: AnnData,
output_compression=None,
):
output_file = Path(output_file)
h5mu = Path(h5mu)
output_file_uncompressed = (
output_file.with_name(output_file.stem + "_uncompressed.h5mu")
if output_compression
else output_file
)
shutil.copyfile(h5mu, output_file_uncompressed)
write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
if output_compression:
compress_h5mu(
output_file_uncompressed, output_file, compression=output_compression
)
output_file_uncompressed.unlink()

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,24 @@
import anndata as ad
import re
def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
"""Sanitize gene names and set the index of the .var DataFrame.
Parameters
----------
adata : AnnData
Annotated data object
var_name : str | None
Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced.
Returns
-------
AnnData
Copy of `adata` with sanitized and replaced index
"""
if var_name:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]]
else:
adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index]
return adata

View File

@@ -0,0 +1,31 @@
def subset_vars(adata, subset_col):
"""Subset AnnData object on highly variable genes
Parameters
----------
adata : AnnData
Annotated data object
subset_col : str
Name of the boolean column in `adata.var` that contains the information if features should be used or not
Returns
-------
AnnData
Copy of `adata` with subsetted features
"""
if subset_col not in adata.var.columns:
raise ValueError(
f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
)
if adata.var[subset_col].dtype == "boolean":
assert adata.var[subset_col].isna().sum() == 0, (
f"The .var column `{subset_col}` contains NaN values. Can not subset data."
)
adata.var[subset_col] = adata.var[subset_col].astype("bool")
assert adata.var[subset_col].dtype == "bool", (
f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data."
)
return adata[:, adata.var[subset_col]].copy()

View File

@@ -0,0 +1,395 @@
name: "totalvi"
namespace: "integrate"
version: "main"
authors:
- name: "Vladimir Shitov"
info:
role: "Contributor"
links:
email: "vladimir.shitov@helmholtz-muenchen.de"
github: "vladimirshitov"
orcid: "0000-0002-1960-8812"
linkedin: "vladimir-shitov-9a659513b"
organizations:
- name: "Helmholtz Munich"
href: "https://www.helmholtz-munich.de"
role: "PhD Candidate"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file with query data to integrate with reference."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--reference"
alternatives:
- "-r"
description: "Input h5mu file with reference data to train the TOTALVI model."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--force_retrain"
alternatives:
- "-f"
description: "If true, retrain the model and save it to reference_model_path"
info: null
direction: "input"
- type: "string"
name: "--query_modality"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--query_proteins_modality"
description: "Name of the modality in the input (query) h5mu file containing protein\
\ data"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_modality"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_proteins_modality"
description: "Name of the modality containing proteins in the reference"
info: null
default:
- "prot"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "Input layer to use. If None, X is used"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_batch"
description: "Column name discriminating between your batches."
info: null
default:
- "sample_id"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_input"
description: ".var column containing highly variable genes. By default, do not\
\ subset genes."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_output"
description: "In which .obsm slot to store the resulting integrated embedding."
info: null
default:
- "X_integrated_totalvi"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_normalized_rna_output"
description: "In which .obsm slot to store the normalized RNA from TOTALVI."
info: null
default:
- "X_totalvi_normalized_rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_normalized_protein_output"
description: "In which .obsm slot to store the normalized protein data from TOTALVI."
info: null
default:
- "X_totalvi_normalized_protein"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--reference_model_path"
description: "Directory with the reference model. If not exists, trained model\
\ will be saved there"
info: null
default:
- "totalvi_model_reference"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--query_model_path"
description: "Directory, where the query model will be saved"
info: null
default:
- "totalvi_model_query"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- name: "Learning parameters"
arguments:
- type: "integer"
name: "--max_epochs"
description: "Number of passes through the dataset"
info: null
default:
- 400
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_query_epochs"
description: "Number of passes through the dataset, when fine-tuning model for\
\ query"
info: null
default:
- 200
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--weight_decay"
description: "Weight decay, when fine-tuning model for query"
info: null
default:
- 0.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Performs mapping to the reference by totalvi model: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html#Reference-mapping-with-TOTALVI"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_mms.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highmem"
- "highcpu"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "nvcr.io/nvidia/pytorch:25.05-py3"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "jax[cuda]"
- "scvi-tools~=1.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/integrate/totalvi/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/integrate/totalvi"
executable: "target/executable/integrate/totalvi/totalvi"
viash_version: "0.9.4"
git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2055-g173327cc"
package_config:
name: "openpipeline"
version: "main"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,48 @@
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assinged increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
import logging
from sys import stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)
return logger

File diff suppressed because it is too large Load Diff