Build branch main with version main (173327cc)

Build pipeline: vsh-ci-build-template-k4qzr Source commit: 173327cc56 Source message: Cellranger multi conversion: fix combined AB + CB probe experiments (#1062)
2025-08-22 08:50:18 +00:00
commit cd5554d22f
2226 changed files with 1154442 additions and 0 deletions
--- a/target/executable/integrate/harmonypy/.config.vsh.yaml
+++ b/target/executable/integrate/harmonypy/.config.vsh.yaml
@@ -0,0 +1,322 @@
+name: "harmonypy"
+namespace: "integrate"
+version: "main"
+authors:
+- name: "Dries Schaumont"
+  roles:
+  - "maintainer"
+  info:
+    role: "Core Team Member"
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+- name: "Robrecht Cannoodt"
+  roles:
+  - "contributor"
+  info:
+    role: "Core Team Member"
+    links:
+      email: "robrecht@data-intuitive.com"
+      github: "rcannood"
+      orcid: "0000-0003-3641-729X"
+      linkedin: "robrechtcannoodt"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Science Engineer"
+    - name: "Open Problems"
+      href: "https://openproblems.bio"
+      role: "Core Member"
+argument_groups:
+- name: "Arguments"
+  arguments:
+  - type: "file"
+    name: "--input"
+    alternatives:
+    - "-i"
+    description: "Input h5mu file"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--output"
+    alternatives:
+    - "-o"
+    description: "Output h5mu file."
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--modality"
+    description: "Which modality from the input MuData file to process.\n"
+    info: null
+    default:
+    - "rna"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obsm_input"
+    description: "Which .obsm slot to use as a starting PCA embedding."
+    info: null
+    default:
+    - "X_pca"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obsm_output"
+    description: "In which .obsm slot to store the resulting integrated embedding."
+    info: null
+    default:
+    - "X_pca_integrated"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--theta"
+    description: "Diversity clustering penalty parameter. Can be set as a single value\
+      \ for all batch observations or as multiple values, one for each observation\
+      \ in the batches defined by --obs_covariates. theta=0 does not encourage any\
+      \ diversity. Larger values of theta result in more diverse clusters."
+    info: null
+    default:
+    - 2.0
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_covariates"
+    description: "The .obs field(s) that define the covariate(s) to regress out."
+    info: null
+    example:
+    - "batch"
+    - "sample"
+    required: true
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "string"
+    name: "--output_compression"
+    description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
+      By default no compression is applied.\n"
+    info: null
+    example:
+    - "gzip"
+    required: false
+    choices:
+    - "gzip"
+    - "lzf"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+resources:
+- type: "python_script"
+  path: "script.py"
+  is_executable: true
+- type: "file"
+  path: "setup_logger.py"
+- type: "file"
+  path: "compress_h5mu.py"
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+description: "Performs Harmony integration as described in https://github.com/immunogenomics/harmony.\
+  \ Based on an implementation in python from https://github.com/slowkow/harmonypy"
+test_resources:
+- type: "python_script"
+  path: "test.py"
+  is_executable: true
+- type: "file"
+  path: "pbmc_1k_protein_v3_mms.h5mu"
+info: null
+status: "enabled"
+scope:
+  image: "public"
+  target: "public"
+license: "MIT"
+links:
+  repository: "https://github.com/openpipelines-bio/openpipeline"
+  docker_registry: "ghcr.io"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    label:
+    - "highmem"
+    - "highcpu"
+    - "highdisk"
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+engines:
+- type: "docker"
+  id: "docker"
+  image: "python:3.12-slim"
+  target_registry: "images.viash-hub.com"
+  target_tag: "main"
+  namespace_separator: "/"
+  setup:
+  - type: "apt"
+    packages:
+    - "procps"
+    interactive: false
+  - type: "python"
+    user: false
+    packages:
+    - "anndata~=0.11.1"
+    - "mudata~=0.3.1"
+    - "scanpy~=1.10.4"
+    - "harmonypy~=0.0.6"
+    script:
+    - "exec(\"try:\\n  import awkward\\nexcept ModuleNotFoundError:\\n  exit(0)\\\
+      nelse:  exit(1)\")"
+    upgrade: true
+  test_setup:
+  - type: "apt"
+    packages:
+    - "git"
+    interactive: false
+  - type: "python"
+    user: false
+    packages:
+    - "viashpy==0.8.0"
+    github:
+    - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
+    upgrade: true
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:
+  config: "src/integrate/harmonypy/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/integrate/harmonypy"
+  executable: "target/executable/integrate/harmonypy/harmonypy"
+  viash_version: "0.9.4"
+  git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
+  git_remote: "https://github.com/openpipelines-bio/openpipeline"
+  git_tag: "0.2.0-2055-g173327cc"
+package_config:
+  name: "openpipeline"
+  version: "main"
+  summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
+  description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
+    \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
+    \nIn terms of workflows, the following has been made available, but keep in mind\
+    \ that\nindividual tools and functionality can be executed as standalone components\
+    \ as well.\n\n  * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
+    \  * Ingestion: Read mapping and generating a count matrix.\n  * Single sample\
+    \ processing: cell filtering and doublet detection.\n  * Multisample processing:\
+    \ Count transformation, normalization, QC metric calculations.\n  * Integration:\
+    \ Clustering, integration and batch correction using single and multimodal methods.\n\
+    \  * Downstream analysis workflows\n"
+  info:
+    test_resources:
+    - type: "s3"
+      path: "s3://openpipelines-data"
+      dest: "resources_test"
+    nextflow_labels_ci:
+    - path: "src/workflows/utils/labels_ci.config"
+      description: "Adds the correct memory and CPU labels when running on the Viash\
+        \ Hub CI."
+  viash_version: "0.9.4"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
+    .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
+    )'\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'main'"
+  keywords:
+  - "single-cell"
+  - "multimodal"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/openpipelines-bio/openpipeline"
+    docker_registry: "ghcr.io"
+    homepage: "https://openpipelines.bio"
+    documentation: "https://openpipelines.bio/fundamentals"
+    issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
--- a/target/executable/integrate/harmonypy/compress_h5mu.py
+++ b/target/executable/integrate/harmonypy/compress_h5mu.py
@@ -0,0 +1,87 @@
+import shutil
+from anndata import AnnData
+from mudata import write_h5ad
+from h5py import File as H5File
+from h5py import Group, Dataset
+from pathlib import Path
+from typing import Union, Literal
+from functools import partial
+
+
+def compress_h5mu(
+    input_path: Union[str, Path],
+    output_path: Union[str, Path],
+    compression: Union[Literal["gzip"], Literal["lzf"]],
+):
+    input_path, output_path = str(input_path), str(output_path)
+
+    def copy_attributes(in_object, out_object):
+        for key, value in in_object.attrs.items():
+            out_object.attrs[key] = value
+
+    def visit_path(
+        output_h5: H5File,
+        compression: Union[Literal["gzip"], Literal["lzf"]],
+        name: str,
+        object: Union[Group, Dataset],
+    ):
+        if isinstance(object, Group):
+            new_group = output_h5.create_group(name)
+            copy_attributes(object, new_group)
+        elif isinstance(object, Dataset):
+            # Compression only works for non-scalar Dataset objects.
+            # Scalar datasets have shape () and empty datasets have shape None.
+            if not object.compression and object.shape not in [None, ()]:
+                new_dataset = output_h5.create_dataset(
+                    name, data=object, compression=compression
+                )
+                copy_attributes(object, new_dataset)
+            else:
+                output_h5.copy(object, name)
+        else:
+            raise NotImplementedError(
+                f"Could not copy element {name}, "
+                f"type has not been implemented yet: {type(object)}"
+            )
+
+    with (
+        H5File(input_path, "r") as input_h5,
+        H5File(output_path, "w", userblock_size=512) as output_h5,
+    ):
+        copy_attributes(input_h5, output_h5)
+        input_h5.visititems(partial(visit_path, output_h5, compression))
+
+    with open(input_path, "rb") as input_bytes:
+        # Mudata puts metadata like this in the first 512 bytes:
+        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
+        # See mudata/_core/io.py, read_h5mu() function
+        starting_metadata = input_bytes.read(100)
+        # The metadata is padded with extra null bytes up until 512 bytes
+        truncate_location = starting_metadata.find(b"\x00")
+        starting_metadata = starting_metadata[:truncate_location]
+    with open(output_path, "br+") as f:
+        nbytes = f.write(starting_metadata)
+        f.write(b"\0" * (512 - nbytes))
+
+
+def write_h5ad_to_h5mu_with_compression(
+    output_file: Union[str, Path],
+    h5mu: Union[str, Path],
+    modality_name: str,
+    modality_data: AnnData,
+    output_compression=None,
+):
+    output_file = Path(output_file)
+    h5mu = Path(h5mu)
+    output_file_uncompressed = (
+        output_file.with_name(output_file.stem + "_uncompressed.h5mu")
+        if output_compression
+        else output_file
+    )
+    shutil.copyfile(h5mu, output_file_uncompressed)
+    write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
+    if output_compression:
+        compress_h5mu(
+            output_file_uncompressed, output_file, compression=output_compression
+        )
+        output_file_uncompressed.unlink()
--- a/target/executable/integrate/harmonypy/harmonypy
+++ b/target/executable/integrate/harmonypy/harmonypy
--- a/target/executable/integrate/harmonypy/nextflow_labels.config
+++ b/target/executable/integrate/harmonypy/nextflow_labels.config
@@ -0,0 +1,48 @@
+process {
+  // Default resources for components that hardly do any processing
+  memory = { 2.GB * task.attempt }
+  cpus = 1
+
+  // Retry for exit codes that have something to do with memory issues
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+  maxRetries = 3
+
+  // The memory a task is assigned increases with each attempt.
+  // Uncomment the line below and adjust the value to set a global upper limit on the memory.
+  // resourceLimits = [ memory: 240.Gb ] 
+
+  // CPU resources
+  withLabel: singlecpu { cpus = 1 }
+  withLabel: lowcpu { cpus = 4 }
+  withLabel: midcpu { cpus = 10 }
+  withLabel: highcpu { cpus = 20 }
+  
+  // Memory resources
+  withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
+  withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
+  withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
+  withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
+
+  // Disk space
+  withLabel: lowdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: middisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: highdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: veryhighdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  
+  // NOTE: The above labels intentionally do not have an effect by default.
+  // The user should set the disk space requirements by adding the following
+  // to the compute environment:
+  //
+  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
+  // withLabel: middisk { disk = { 100.GB * task.attempt } }
+  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
+  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
+}
--- a/target/executable/integrate/harmonypy/setup_logger.py
+++ b/target/executable/integrate/harmonypy/setup_logger.py
@@ -0,0 +1,12 @@
+def setup_logger():
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger
--- a/target/executable/integrate/scanorama/.config.vsh.yaml
+++ b/target/executable/integrate/scanorama/.config.vsh.yaml
@@ -0,0 +1,360 @@
+name: "scanorama"
+namespace: "integrate"
+version: "main"
+authors:
+- name: "Dries De Maeyer"
+  roles:
+  - "author"
+  info:
+    role: "Core Team Member"
+    links:
+      email: "ddemaeyer@gmail.com"
+      github: "ddemaeyer"
+      linkedin: "dries-de-maeyer-b46a814"
+    organizations:
+    - name: "Janssen Pharmaceuticals"
+      href: "https://www.janssen.com"
+      role: "Principal Scientist"
+- name: "Dries Schaumont"
+  roles:
+  - "maintainer"
+  info:
+    role: "Core Team Member"
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+argument_groups:
+- name: "Arguments"
+  arguments:
+  - type: "file"
+    name: "--input"
+    alternatives:
+    - "-i"
+    description: "Input h5mu file"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--modality"
+    description: "Which modality from the input MuData file to process.\n"
+    info: null
+    default:
+    - "rna"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--output"
+    alternatives:
+    - "-o"
+    description: "Output .h5mu file"
+    info: null
+    default:
+    - "output.h5ad"
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_batch"
+    description: "Column name discriminating between your batches."
+    info: null
+    default:
+    - "batch"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obsm_input"
+    description: "Basis obsm slot to run scanorama on."
+    info: null
+    default:
+    - "X_pca"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obsm_output"
+    description: "The name of the field in adata.obsm where the integrated embeddings\
+      \ will be stored after running this function. Defaults to X_scanorama."
+    info: null
+    default:
+    - "X_scanorama"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--knn"
+    description: "Number of nearest neighbors to use for matching."
+    info: null
+    default:
+    - 20
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--batch_size"
+    description: "The batch size used in the alignment vector computation. Useful\
+      \ when integrating very large (>100k samples) datasets. Set to a large value that\
+      \ runs within available memory."
+    info: null
+    default:
+    - 5000
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--sigma"
+    description: "Correction smoothing parameter on Gaussian kernel."
+    info: null
+    default:
+    - 15.0
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean"
+    name: "--approx"
+    description: "Use approximate nearest neighbors with Python annoy; greatly speeds\
+      \ up matching runtime."
+    info: null
+    default:
+    - true
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--alpha"
+    description: "Alignment score minimum cutoff"
+    info: null
+    default:
+    - 0.1
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--output_compression"
+    description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
+      By default no compression is applied.\n"
+    info: null
+    example:
+    - "gzip"
+    required: false
+    choices:
+    - "gzip"
+    - "lzf"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+resources:
+- type: "python_script"
+  path: "script.py"
+  is_executable: true
+- type: "file"
+  path: "setup_logger.py"
+- type: "file"
+  path: "compress_h5mu.py"
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+description: "Use Scanorama to integrate different experiments.\n"
+test_resources:
+- type: "python_script"
+  path: "test.py"
+  is_executable: true
+- type: "file"
+  path: "pbmc_1k_protein_v3_mms.h5mu"
+info: null
+status: "enabled"
+scope:
+  image: "public"
+  target: "public"
+license: "MIT"
+links:
+  repository: "https://github.com/openpipelines-bio/openpipeline"
+  docker_registry: "ghcr.io"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    label:
+    - "midcpu"
+    - "highmem"
+    - "highdisk"
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+engines:
+- type: "docker"
+  id: "docker"
+  image: "python:3.12-slim"
+  target_registry: "images.viash-hub.com"
+  target_tag: "main"
+  namespace_separator: "/"
+  setup:
+  - type: "apt"
+    packages:
+    - "procps"
+    - "build-essential"
+    interactive: false
+  - type: "python"
+    user: false
+    packages:
+    - "anndata~=0.11.1"
+    - "mudata~=0.3.1"
+    - "scanpy~=1.10.4"
+    - "scanorama"
+    script:
+    - "exec(\"try:\\n  import awkward\\nexcept ModuleNotFoundError:\\n  exit(0)\\\
+      nelse:  exit(1)\")"
+    upgrade: true
+  test_setup:
+  - type: "apt"
+    packages:
+    - "git"
+    interactive: false
+  - type: "python"
+    user: false
+    packages:
+    - "viashpy==0.8.0"
+    github:
+    - "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
+    upgrade: true
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:
+  config: "src/integrate/scanorama/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/integrate/scanorama"
+  executable: "target/executable/integrate/scanorama/scanorama"
+  viash_version: "0.9.4"
+  git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
+  git_remote: "https://github.com/openpipelines-bio/openpipeline"
+  git_tag: "0.2.0-2055-g173327cc"
+package_config:
+  name: "openpipeline"
+  version: "main"
+  summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
+  description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
+    \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
+    \nIn terms of workflows, the following has been made available, but keep in mind\
+    \ that\nindividual tools and functionality can be executed as standalone components\
+    \ as well.\n\n  * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
+    \  * Ingestion: Read mapping and generating a count matrix.\n  * Single sample\
+    \ processing: cell filtering and doublet detection.\n  * Multisample processing:\
+    \ Count transformation, normalization, QC metric calculations.\n  * Integration:\
+    \ Clustering, integration and batch correction using single and multimodal methods.\n\
+    \  * Downstream analysis workflows\n"
+  info:
+    test_resources:
+    - type: "s3"
+      path: "s3://openpipelines-data"
+      dest: "resources_test"
+    nextflow_labels_ci:
+    - path: "src/workflows/utils/labels_ci.config"
+      description: "Adds the correct memory and CPU labels when running on the Viash\
+        \ Hub CI."
+  viash_version: "0.9.4"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
+    .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
+    )'\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'main'"
+  keywords:
+  - "single-cell"
+  - "multimodal"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/openpipelines-bio/openpipeline"
+    docker_registry: "ghcr.io"
+    homepage: "https://openpipelines.bio"
+    documentation: "https://openpipelines.bio/fundamentals"
+    issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
--- a/target/executable/integrate/scanorama/compress_h5mu.py
+++ b/target/executable/integrate/scanorama/compress_h5mu.py
@@ -0,0 +1,87 @@
+import shutil
+from anndata import AnnData
+from mudata import write_h5ad
+from h5py import File as H5File
+from h5py import Group, Dataset
+from pathlib import Path
+from typing import Union, Literal
+from functools import partial
+
+
+def compress_h5mu(
+    input_path: Union[str, Path],
+    output_path: Union[str, Path],
+    compression: Union[Literal["gzip"], Literal["lzf"]],
+):
+    input_path, output_path = str(input_path), str(output_path)
+
+    def copy_attributes(in_object, out_object):
+        for key, value in in_object.attrs.items():
+            out_object.attrs[key] = value
+
+    def visit_path(
+        output_h5: H5File,
+        compression: Union[Literal["gzip"], Literal["lzf"]],
+        name: str,
+        object: Union[Group, Dataset],
+    ):
+        if isinstance(object, Group):
+            new_group = output_h5.create_group(name)
+            copy_attributes(object, new_group)
+        elif isinstance(object, Dataset):
+            # Compression only works for non-scalar Dataset objects.
+            # Scalar datasets have shape () and empty datasets have shape None.
+            if not object.compression and object.shape not in [None, ()]:
+                new_dataset = output_h5.create_dataset(
+                    name, data=object, compression=compression
+                )
+                copy_attributes(object, new_dataset)
+            else:
+                output_h5.copy(object, name)
+        else:
+            raise NotImplementedError(
+                f"Could not copy element {name}, "
+                f"type has not been implemented yet: {type(object)}"
+            )
+
+    with (
+        H5File(input_path, "r") as input_h5,
+        H5File(output_path, "w", userblock_size=512) as output_h5,
+    ):
+        copy_attributes(input_h5, output_h5)
+        input_h5.visititems(partial(visit_path, output_h5, compression))
+
+    with open(input_path, "rb") as input_bytes:
+        # Mudata puts metadata like this in the first 512 bytes:
+        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
+        # See mudata/_core/io.py, read_h5mu() function
+        starting_metadata = input_bytes.read(100)
+        # The metadata is padded with extra null bytes up until 512 bytes
+        truncate_location = starting_metadata.find(b"\x00")
+        starting_metadata = starting_metadata[:truncate_location]
+    with open(output_path, "br+") as f:
+        nbytes = f.write(starting_metadata)
+        f.write(b"\0" * (512 - nbytes))
+
+
+def write_h5ad_to_h5mu_with_compression(
+    output_file: Union[str, Path],
+    h5mu: Union[str, Path],
+    modality_name: str,
+    modality_data: AnnData,
+    output_compression=None,
+):
+    output_file = Path(output_file)
+    h5mu = Path(h5mu)
+    output_file_uncompressed = (
+        output_file.with_name(output_file.stem + "_uncompressed.h5mu")
+        if output_compression
+        else output_file
+    )
+    shutil.copyfile(h5mu, output_file_uncompressed)
+    write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
+    if output_compression:
+        compress_h5mu(
+            output_file_uncompressed, output_file, compression=output_compression
+        )
+        output_file_uncompressed.unlink()
--- a/target/executable/integrate/scanorama/nextflow_labels.config
+++ b/target/executable/integrate/scanorama/nextflow_labels.config
@@ -0,0 +1,48 @@
+process {
+  // Default resources for components that hardly do any processing
+  memory = { 2.GB * task.attempt }
+  cpus = 1
+
+  // Retry for exit codes that have something to do with memory issues
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+  maxRetries = 3
+
+  // The memory a task is assigned increases with each attempt.
+  // Uncomment the line below and adjust the value to set a global upper limit on the memory.
+  // resourceLimits = [ memory: 240.GB ]
+
+  // CPU resources
+  withLabel: singlecpu { cpus = 1 }
+  withLabel: lowcpu { cpus = 4 }
+  withLabel: midcpu { cpus = 10 }
+  withLabel: highcpu { cpus = 20 }
+  
+  // Memory resources
+  withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
+  withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
+  withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
+  withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
+
+  // Disk space
+  withLabel: lowdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: middisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: highdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: veryhighdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  
+  // NOTE: The above labels intentionally do not have an effect by default.
+  // The user should set the disk space requirements by adding the following
+  // to the compute environment:
+  //
+  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
+  // withLabel: middisk { disk = { 100.GB * task.attempt } }
+  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
+  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
+}
--- a/target/executable/integrate/scanorama/scanorama
+++ b/target/executable/integrate/scanorama/scanorama
--- a/target/executable/integrate/scanorama/setup_logger.py
+++ b/target/executable/integrate/scanorama/setup_logger.py
@@ -0,0 +1,12 @@
+def setup_logger():
+    """Configure the root logger to emit INFO-level records on stdout.
+
+    The function is safe to call more than once: a stdout stream handler is
+    only attached when the root logger does not already have one.
+
+    Returns
+    -------
+    logging.Logger
+        The root logger, with its level set to INFO.
+    """
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    # Handlers accumulate on the root logger; attaching a new one on every call
+    # would print each record once per call.
+    already_attached = any(
+        isinstance(handler, logging.StreamHandler)
+        and getattr(handler, "stream", None) is stdout
+        for handler in logger.handlers
+    )
+    if not already_attached:
+        console_handler = logging.StreamHandler(stdout)
+        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+        console_handler.setFormatter(logFormatter)
+        logger.addHandler(console_handler)
+
+    return logger
--- a/target/executable/integrate/scarches/.config.vsh.yaml
+++ b/target/executable/integrate/scarches/.config.vsh.yaml
@@ -0,0 +1,482 @@
+name: "scarches"
+namespace: "integrate"
+version: "main"
+authors:
+- name: "Vladimir Shitov"
+  roles:
+  - "author"
+  info:
+    role: "Contributor"
+    links:
+      email: "vladimir.shitov@helmholtz-muenchen.de"
+      github: "vladimirshitov"
+      orcid: "0000-0002-1960-8812"
+      linkedin: "vladimir-shitov-9a659513b"
+    organizations:
+    - name: "Helmholtz Munich"
+      href: "https://www.helmholtz-munich.de"
+      role: "PhD Candidate"
+- name: "Dorien Roosen"
+  roles:
+  - "maintainer"
+  info:
+    role: "Core Team Member"
+    links:
+      email: "dorien@data-intuitive.com"
+      github: "dorien-er"
+      linkedin: "dorien-roosen"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+argument_groups:
+- name: "Inputs"
+  description: "Arguments related to the input (query) dataset"
+  arguments:
+  - type: "file"
+    name: "--input"
+    alternatives:
+    - "-i"
+    description: "Input h5mu file to use as a query"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--layer"
+    description: "Layer to be used for scArches, if .X is not to be used."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--modality"
+    description: "Which modality from the input MuData file to process.\n"
+    info: null
+    default:
+    - "rna"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--input_obs_batch"
+    description: "Name of the .obs column with batch information."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--input_obs_label"
+    description: "Name of the .obs column with celltype information."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--input_var_gene_names"
+    description: "Name of the .var column with gene names, if the var .index is not\
+      \ to be used."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--input_obs_size_factor"
+    description: "Key in adata.obs for size factor information. Instead of using library\
+      \ size as a size factor,\nthe provided size factor column will be used as offset\
+      \ in the mean of the likelihood.\nAssumed to be on linear scale.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Reference"
+  arguments:
+  - type: "file"
+    name: "--reference"
+    alternatives:
+    - "-r"
+    description: "Path to the directory with reference model or a web link."
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--reference_class"
+    description: "For legacy models; the type of model (where the type of model was\
+      \ not saved with it; e.g. when they were generated with scvi-tools versions\
+      \ < 0.15)."
+    info: null
+    example:
+    - "SCANVI"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Outputs"
+  arguments:
+  - type: "file"
+    name: "--output"
+    alternatives:
+    - "-o"
+    description: "Output h5mu file."
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--model_output"
+    description: "Output directory for model"
+    info: null
+    default:
+    - "model"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obsm_output"
+    description: "In which .obsm slot to store the resulting integrated embedding."
+    info: null
+    default:
+    - "X_integrated_scanvi"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_output_predictions"
+    description: "In which .obs slot to store the resulting label predictions. Only\
+      \ relevant if a scANVI model was provided."
+    info: null
+    default:
+    - "scanvi_pred"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_output_probabilities"
+    description: "In which .obs slot to store the probabilities of the label predictions.\
+      \ Only relevant if a scANVI model was provided."
+    info: null
+    default:
+    - "scanvi_proba"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--output_compression"
+    description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
+      By default no compression is applied.\n"
+    info: null
+    example:
+    - "gzip"
+    required: false
+    choices:
+    - "gzip"
+    - "lzf"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Early stopping arguments"
+  arguments:
+  - type: "boolean"
+    name: "--early_stopping"
+    description: "Whether to perform early stopping with respect to the validation\
+      \ set."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--early_stopping_monitor"
+    description: "Metric logged during validation set epoch."
+    info: null
+    default:
+    - "elbo_validation"
+    required: false
+    choices:
+    - "elbo_validation"
+    - "reconstruction_loss_validation"
+    - "kl_local_validation"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--early_stopping_patience"
+    description: "Number of validation epochs with no improvement after which training\
+      \ will be stopped."
+    info: null
+    default:
+    - 45
+    required: false
+    min: 1
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--early_stopping_min_delta"
+    description: "Minimum change in the monitored quantity to qualify as an improvement,\
+      \ i.e. an absolute change of less than min_delta, will count as no improvement."
+    info: null
+    default:
+    - 0.0
+    required: false
+    min: 0.0
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Learning parameters"
+  arguments:
+  - type: "integer"
+    name: "--max_epochs"
+    description: "Number of passes through the dataset, defaults to (20000 / number\
+      \ of cells) * 400 or 400; whichever is smallest."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean"
+    name: "--reduce_lr_on_plateau"
+    description: "Whether to monitor validation loss and reduce learning rate when\
+      \ validation set `lr_scheduler_metric` plateaus."
+    info: null
+    default:
+    - true
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--lr_factor"
+    description: "Factor to reduce learning rate."
+    info: null
+    default:
+    - 0.6
+    required: false
+    min: 0.0
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--lr_patience"
+    description: "Number of epochs with no improvement after which learning rate will\
+      \ be reduced."
+    info: null
+    default:
+    - 30.0
+    required: false
+    min: 0.0
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+resources:
+- type: "python_script"
+  path: "script.py"
+  is_executable: true
+- type: "file"
+  path: "setup_logger.py"
+- type: "file"
+  path: "compress_h5mu.py"
+- type: "file"
+  path: "set_var_index.py"
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+description: "Performs reference mapping with scArches"
+test_resources:
+- type: "python_script"
+  path: "test.py"
+  is_executable: true
+- type: "file"
+  path: "scanvi_model"
+- type: "file"
+  path: "scvi_model"
+- type: "file"
+  path: "pbmc_1k_protein_v3_mms.h5mu"
+- type: "file"
+  path: "HLCA_reference_model.zip"
+info: null
+status: "enabled"
+scope:
+  image: "public"
+  target: "public"
+license: "MIT"
+links:
+  repository: "https://github.com/openpipelines-bio/openpipeline"
+  docker_registry: "ghcr.io"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    label:
+    - "highmem"
+    - "highcpu"
+    - "highdisk"
+    - "gpu"
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+engines:
+- type: "docker"
+  id: "docker"
+  image: "nvcr.io/nvidia/pytorch:25.05-py3"
+  target_registry: "images.viash-hub.com"
+  target_tag: "main"
+  namespace_separator: "/"
+  setup:
+  - type: "python"
+    user: false
+    packages:
+    - "anndata~=0.11.1"
+    - "mudata~=0.3.1"
+    - "jax[cuda]"
+    - "scvi-tools~=1.3.1"
+    script:
+    - "exec(\"try:\\n  import awkward\\nexcept ModuleNotFoundError:\\n  exit(0)\\\
+      nelse:  exit(1)\")"
+    upgrade: true
+  test_setup:
+  - type: "python"
+    user: false
+    packages:
+    - "viashpy==0.8.0"
+    upgrade: true
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:
+  config: "src/integrate/scarches/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/integrate/scarches"
+  executable: "target/executable/integrate/scarches/scarches"
+  viash_version: "0.9.4"
+  git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
+  git_remote: "https://github.com/openpipelines-bio/openpipeline"
+  git_tag: "0.2.0-2055-g173327cc"
+package_config:
+  name: "openpipeline"
+  version: "main"
+  summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
+  description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
+    \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
+    \nIn terms of workflows, the following has been made available, but keep in mind\
+    \ that\nindividual tools and functionality can be executed as standalone components\
+    \ as well.\n\n  * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
+    \  * Ingestion: Read mapping and generating a count matrix.\n  * Single sample\
+    \ processing: cell filtering and doublet detection.\n  * Multisample processing:\
+    \ Count transformation, normalization, QC metric calulations.\n  * Integration:\
+    \ Clustering, integration and batch correction using single and multimodal methods.\n\
+    \  * Downstream analysis workflows\n"
+  info:
+    test_resources:
+    - type: "s3"
+      path: "s3://openpipelines-data"
+      dest: "resources_test"
+    nextflow_labels_ci:
+    - path: "src/workflows/utils/labels_ci.config"
+      description: "Adds the correct memory and CPU labels when running on the Viash\
+        \ Hub CI."
+  viash_version: "0.9.4"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
+    .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
+    )'\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'main'"
+  keywords:
+  - "single-cell"
+  - "multimodal"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/openpipelines-bio/openpipeline"
+    docker_registry: "ghcr.io"
+    homepage: "https://openpipelines.bio"
+    documentation: "https://openpipelines.bio/fundamentals"
+    issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
--- a/target/executable/integrate/scarches/compress_h5mu.py
+++ b/target/executable/integrate/scarches/compress_h5mu.py
@@ -0,0 +1,87 @@
+import shutil
+from anndata import AnnData
+from mudata import write_h5ad
+from h5py import File as H5File
+from h5py import Group, Dataset
+from pathlib import Path
+from typing import Union, Literal
+from functools import partial
+
+
+def compress_h5mu(
+    input_path: Union[str, Path],
+    output_path: Union[str, Path],
+    compression: Union[Literal["gzip"], Literal["lzf"]],
+):
+    """Write a copy of an h5mu file in which datasets are compressed.
+
+    Groups, datasets and their attributes are recreated in a new HDF5 file that
+    reserves a 512-byte userblock. The text header found at the start of the
+    input file is then copied into that userblock.
+
+    Parameters
+    ----------
+    input_path
+        Path of the existing h5mu file to read.
+    output_path
+        Path of the h5mu file to create (opened in "w" mode).
+    compression
+        Compression filter passed to ``create_dataset`` for every non-scalar
+        dataset that is not already compressed.
+
+    Raises
+    ------
+    NotImplementedError
+        If the input file contains an object that is neither a Group nor a
+        Dataset.
+    """
+    input_path, output_path = str(input_path), str(output_path)
+    # Size in bytes of the userblock reserved at the start of the output file.
+    userblock_size = 512
+
+    def copy_attributes(in_object, out_object):
+        """Copy every attribute of ``in_object`` onto ``out_object``."""
+        for key, value in in_object.attrs.items():
+            out_object.attrs[key] = value
+
+    def visit_path(
+        output_h5: H5File,
+        compression: Union[Literal["gzip"], Literal["lzf"]],
+        name: str,
+        object: Union[Group, Dataset],
+    ):
+        """Recreate one visited input object under the same name in ``output_h5``."""
+        if isinstance(object, Group):
+            new_group = output_h5.create_group(name)
+            copy_attributes(object, new_group)
+        elif isinstance(object, Dataset):
+            # Compression only works for non-scalar Dataset objects
+            # Scalar objects dont have a shape defined
+            if not object.compression and object.shape not in [None, ()]:
+                new_dataset = output_h5.create_dataset(
+                    name, data=object, compression=compression
+                )
+                copy_attributes(object, new_dataset)
+            else:
+                # Already compressed or scalar: copy the dataset unchanged.
+                output_h5.copy(object, name)
+        else:
+            raise NotImplementedError(
+                f"Could not copy element {name}, "
+                f"type has not been implemented yet: {type(object)}"
+            )
+
+    with (
+        H5File(input_path, "r") as input_h5,
+        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
+    ):
+        copy_attributes(input_h5, output_h5)
+        input_h5.visititems(partial(visit_path, output_h5, compression))
+
+    with open(input_path, "rb") as input_bytes:
+        # Mudata puts metadata like this in the first 512 bytes:
+        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
+        # See mudata/_core/io.py, read_h5mu() function
+        header = input_bytes.read(userblock_size)
+        # The metadata is padded with extra null bytes up until 512 bytes.
+        # partition() keeps the whole header when no null byte is present;
+        # slicing with the -1 returned by find() would drop the last byte.
+        starting_metadata = header.partition(b"\x00")[0]
+    with open(output_path, "br+") as f:
+        nbytes = f.write(starting_metadata)
+        f.write(b"\0" * (userblock_size - nbytes))
+
+
+def write_h5ad_to_h5mu_with_compression(
+    output_file: Union[str, Path],
+    h5mu: Union[str, Path],
+    modality_name: str,
+    modality_data: AnnData,
+    output_compression=None,
+):
+    """Copy an h5mu file and write one modality into the copy.
+
+    Parameters
+    ----------
+    output_file
+        Path of the h5mu file to create.
+    h5mu
+        Path of the existing h5mu file that is copied as a starting point.
+    modality_name
+        Name of the modality passed to ``write_h5ad`` as ``mod``.
+    modality_data
+        AnnData object passed to ``write_h5ad`` as ``data``.
+    output_compression
+        Compression forwarded to ``compress_h5mu``. When falsy, no
+        compression step is run and the copy is written directly to
+        ``output_file``.
+    """
+    output_file = Path(output_file)
+    h5mu = Path(h5mu)
+    if not output_compression:
+        # No compression requested: patch the copy directly at its final path.
+        shutil.copyfile(h5mu, output_file)
+        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
+        return
+
+    # Compression rewrites the whole file, so the uncompressed result is first
+    # built in a sibling file next to the final output.
+    output_file_uncompressed = output_file.with_name(
+        output_file.stem + "_uncompressed.h5mu"
+    )
+    try:
+        shutil.copyfile(h5mu, output_file_uncompressed)
+        write_h5ad(
+            filename=output_file_uncompressed, mod=modality_name, data=modality_data
+        )
+        compress_h5mu(
+            output_file_uncompressed, output_file, compression=output_compression
+        )
+    finally:
+        # Remove the intermediate file even when one of the steps above raised,
+        # so a failed run does not leave a stray *_uncompressed.h5mu behind.
+        output_file_uncompressed.unlink(missing_ok=True)
--- a/target/executable/integrate/scarches/nextflow_labels.config
+++ b/target/executable/integrate/scarches/nextflow_labels.config
@@ -0,0 +1,48 @@
+process {
+  // Default resources for components that hardly do any processing
+  memory = { 2.GB * task.attempt }
+  cpus = 1
+
+  // Retry for exit codes that have something to do with memory issues
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+  maxRetries = 3
+
+  // The memory a task is assigned increases with each attempt.
+  // Uncomment the line below and adjust the value to set a global upper limit on the memory.
+  // resourceLimits = [ memory: 240.GB ]
+
+  // CPU resources
+  withLabel: singlecpu { cpus = 1 }
+  withLabel: lowcpu { cpus = 4 }
+  withLabel: midcpu { cpus = 10 }
+  withLabel: highcpu { cpus = 20 }
+  
+  // Memory resources
+  withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
+  withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
+  withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
+  withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
+
+  // Disk space
+  withLabel: lowdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: middisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: highdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: veryhighdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  
+  // NOTE: The above labels intentionally do not have an effect by default.
+  // The user should set the disk space requirements by adding the following
+  // to the compute environment:
+  //
+  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
+  // withLabel: middisk { disk = { 100.GB * task.attempt } }
+  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
+  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
+}
--- a/target/executable/integrate/scarches/scarches
+++ b/target/executable/integrate/scarches/scarches
--- a/target/executable/integrate/scarches/set_var_index.py
+++ b/target/executable/integrate/scarches/set_var_index.py
@@ -0,0 +1,24 @@
+import anndata as ad
+import re
+
+
+def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
+    """Sanitize gene names and set the index of the .var DataFrame.
+
+    A trailing ``.<digits>`` suffix is removed from every name. The object is
+    modified in place; no copy is made.
+
+    Parameters
+    ----------
+    adata : AnnData
+        Annotated data object
+    var_name : str | None
+        Name of the column in `adata.var` that contains the gene names, if None
+        (or empty), the existing index is sanitized and kept as the source.
+
+    Returns
+    -------
+    AnnData
+        The same `adata` object that was passed in, with its `.var` index
+        replaced by the sanitized names
+    """
+    # Pick where the names come from: the requested column, or the current index.
+    names = adata.var[var_name] if var_name else adata.var.index
+    adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in names]
+    return adata
--- a/target/executable/integrate/scarches/setup_logger.py
+++ b/target/executable/integrate/scarches/setup_logger.py
@@ -0,0 +1,12 @@
+def setup_logger():
+    """Configure the root logger to emit INFO-level records on stdout.
+
+    The function is safe to call more than once: a stdout stream handler is
+    only attached when the root logger does not already have one.
+
+    Returns
+    -------
+    logging.Logger
+        The root logger, with its level set to INFO.
+    """
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    # Handlers accumulate on the root logger; attaching a new one on every call
+    # would print each record once per call.
+    already_attached = any(
+        isinstance(handler, logging.StreamHandler)
+        and getattr(handler, "stream", None) is stdout
+        for handler in logger.handlers
+    )
+    if not already_attached:
+        console_handler = logging.StreamHandler(stdout)
+        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+        console_handler.setFormatter(logFormatter)
+        logger.addHandler(console_handler)
+
+    return logger
--- a/target/executable/integrate/scvi/.config.vsh.yaml
+++ b/target/executable/integrate/scvi/.config.vsh.yaml
@@ -0,0 +1,638 @@
+name: "scvi"
+namespace: "integrate"
+version: "main"
+authors:
+- name: "Malte D. Luecken"
+  roles:
+  - "author"
+  info:
+    role: "Core Team Member"
+    links:
+      email: "malte.luecken@helmholtz-muenchen.de"
+      github: "LuckyMD"
+      orcid: "0000-0001-7464-7921"
+      linkedin: "malte-l%C3%BCcken-b8b21049"
+      twitter: "MDLuecken"
+    organizations:
+    - name: "Helmholtz Munich"
+      href: "https://www.helmholtz-munich.de"
+      role: "Group Leader"
+    - name: "Open Problems"
+      href: "https://openproblems.bio"
+      role: "Core Member"
+- name: "Dries Schaumont"
+  roles:
+  - "maintainer"
+  info:
+    role: "Core Team Member"
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+- name: "Matthias Beyens"
+  roles:
+  - "contributor"
+  info:
+    role: "Contributor"
+    links:
+      github: "MatthiasBeyens"
+      orcid: "0000-0003-3304-0706"
+      email: "matthias.beyens@gmail.com"
+      linkedin: "mbeyens"
+    organizations:
+    - name: "Janssen Pharmaceuticals"
+      href: "https://www.janssen.com"
+      role: "Principal Scientist"
+argument_groups:
+- name: "Inputs"
+  arguments:
+  - type: "file"
+    name: "--input"
+    alternatives:
+    - "-i"
+    description: "Input h5mu file"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--modality"
+    description: "Which modality from the input MuData file to process.\n"
+    info: null
+    default:
+    - "rna"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--input_layer"
+    description: "Input layer to use. If None, X is used"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_batch"
+    description: "Column name discriminating between your batches."
+    info: null
+    default:
+    - "sample_id"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--var_gene_names"
+    description: ".var column containing gene names. By default, use the index."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--var_input"
+    description: ".var column containing highly variable genes. By default, do not\
+      \ subset genes."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_labels"
+    description: "Key in adata.obs for label information. Categories will automatically\
+      \ be \nconverted into integer categories and saved to adata.obs['_scvi_labels'].\n\
+      If None, assigns the same label to all the data.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_size_factor"
+    description: "Key in adata.obs for size factor information. Instead of using library\
+      \ size as a size factor,\nthe provided size factor column will be used as offset\
+      \ in the mean of the likelihood.\nAssumed to be on linear scale.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_categorical_covariate"
+    description: "Keys in adata.obs that correspond to categorical data. These covariates\
+      \ can be added in\naddition to the batch covariate and are also treated as nuisance\
+      \ factors\n(i.e., the model tries to minimize their effects on the latent space).\n\
+      Thus, these should not be used for biologically-relevant factors that you do\
+      \ _not_ want to correct for.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_continuous_covariate"
+    description: "Keys in adata.obs that correspond to continuous data. These covariates\
+      \ can be added in\naddition to the batch covariate and are also treated as nuisance\
+      \ factors\n(i.e., the model tries to minimize their effects on the latent space).\
+      \ Thus, these should not be\nused for biologically-relevant factors that you\
+      \ do _not_ want to correct for.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+- name: "Outputs"
+  arguments:
+  - type: "file"
+    name: "--output"
+    alternatives:
+    - "-o"
+    description: "Output h5mu file."
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--output_model"
+    description: "Folder where the state of the trained model will be saved to."
+    info: null
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obsm_output"
+    description: "In which .obsm slot to store the resulting integrated embedding."
+    info: null
+    default:
+    - "X_scvi_integrated"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--output_compression"
+    description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
+      By default no compression is applied.\n"
+    info: null
+    example:
+    - "gzip"
+    required: false
+    choices:
+    - "gzip"
+    - "lzf"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "SCVI options"
+  arguments:
+  - type: "integer"
+    name: "--n_hidden_nodes"
+    description: "Number of nodes per hidden layer."
+    info: null
+    default:
+    - 128
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--n_dimensions_latent_space"
+    description: "Dimensionality of the latent space."
+    info: null
+    default:
+    - 30
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--n_hidden_layers"
+    description: "Number of hidden layers used for encoder and decoder neural-networks."
+    info: null
+    default:
+    - 2
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--dropout_rate"
+    description: "Dropout rate for the neural networks."
+    info: null
+    default:
+    - 0.1
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--dispersion"
+    description: "Set the behavior for the dispersion for negative binomial distributions:\n\
+      - gene: dispersion parameter of negative binomial is constant per gene across\
+      \ cells\n- gene-batch: dispersion can differ between different batches\n- gene-label:\
+      \ dispersion can differ between different labels\n- gene-cell:  dispersion can\
+      \ differ for every gene in every cell\n"
+    info: null
+    default:
+    - "gene"
+    required: false
+    choices:
+    - "gene"
+    - "gene-batch"
+    - "gene-label"
+    - "gene-cell"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--gene_likelihood"
+    description: "Model used to generate the expression data from a count-based likelihood\
+      \ distribution.\n- nb: Negative binomial distribution\n- zinb: Zero-inflated\
+      \ negative binomial distribution\n- poisson: Poisson distribution\n"
+    info: null
+    default:
+    - "nb"
+    required: false
+    choices:
+    - "nb"
+    - "zinb"
+    - "poisson"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Variational auto-encoder model options"
+  arguments:
+  - type: "string"
+    name: "--use_layer_normalization"
+    description: "Neural networks for which to enable layer normalization. \n"
+    info: null
+    default:
+    - "both"
+    required: false
+    choices:
+    - "encoder"
+    - "decoder"
+    - "none"
+    - "both"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--use_batch_normalization"
+    description: "Neural networks for which to enable batch normalization. \n"
+    info: null
+    default:
+    - "none"
+    required: false
+    choices:
+    - "encoder"
+    - "decoder"
+    - "none"
+    - "both"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_false"
+    name: "--encode_covariates"
+    description: "Whether to concatenate covariates to expression in encoder"
+    info: null
+    direction: "input"
+  - type: "boolean_true"
+    name: "--deeply_inject_covariates"
+    description: "Whether to concatenate covariates into output of hidden layers in\
+      \ encoder/decoder. \nThis option only applies when n_layers > 1. The covariates\
+      \ are concatenated to\nthe input of subsequent hidden layers.\n"
+    info: null
+    direction: "input"
+  - type: "boolean_true"
+    name: "--use_observed_lib_size"
+    description: "Use observed library size for RNA as scaling factor in mean of conditional\
+      \ distribution.\n"
+    info: null
+    direction: "input"
+- name: "Early stopping arguments"
+  arguments:
+  - type: "boolean"
+    name: "--early_stopping"
+    description: "Whether to perform early stopping with respect to the validation\
+      \ set."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--early_stopping_monitor"
+    description: "Metric logged during validation set epoch."
+    info: null
+    default:
+    - "elbo_validation"
+    required: false
+    choices:
+    - "elbo_validation"
+    - "reconstruction_loss_validation"
+    - "kl_local_validation"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--early_stopping_patience"
+    description: "Number of validation epochs with no improvement after which training\
+      \ will be stopped."
+    info: null
+    default:
+    - 45
+    required: false
+    min: 1
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--early_stopping_min_delta"
+    description: "Minimum change in the monitored quantity to qualify as an improvement,\
+      \ i.e. an absolute change of less than min_delta, will count as no improvement."
+    info: null
+    default:
+    - 0.0
+    required: false
+    min: 0.0
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Learning parameters"
+  arguments:
+  - type: "integer"
+    name: "--max_epochs"
+    description: "Number of passes through the dataset, defaults to (20000 / number\
+      \ of cells) * 400 or 400; whichever is smallest."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean"
+    name: "--reduce_lr_on_plateau"
+    description: "Whether to monitor validation loss and reduce learning rate when\
+      \ validation set `lr_scheduler_metric` plateaus."
+    info: null
+    default:
+    - true
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--lr_factor"
+    description: "Factor to reduce learning rate."
+    info: null
+    default:
+    - 0.6
+    required: false
+    min: 0.0
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--lr_patience"
+    description: "Number of epochs with no improvement after which learning rate will\
+      \ be reduced."
+    info: null
+    default:
+    - 30.0
+    required: false
+    min: 0.0
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Data validation"
+  arguments:
+  - type: "integer"
+    name: "--n_obs_min_count"
+    description: "Minimum number of cells threshold ensuring that every obs_batch\
+      \ category has sufficient observations (cells) for model training."
+    info: null
+    default:
+    - 0
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--n_var_min_count"
+    description: "Minimum number of genes threshold ensuring that every var_input\
+      \ filter has sufficient observations (genes) for model training."
+    info: null
+    default:
+    - 0
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+resources:
+- type: "python_script"
+  path: "script.py"
+  is_executable: true
+- type: "file"
+  path: "subset_vars.py"
+- type: "file"
+  path: "compress_h5mu.py"
+- type: "file"
+  path: "set_var_index.py"
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+description: "Performs scvi integration as done in the human lung cell atlas https://github.com/LungCellAtlas/HLCA"
+test_resources:
+- type: "python_script"
+  path: "test.py"
+  is_executable: true
+- type: "file"
+  path: "pbmc_1k_protein_v3_mms.h5mu"
+- type: "file"
+  path: "TS_Blood_filtered.h5mu"
+info: null
+status: "enabled"
+scope:
+  image: "public"
+  target: "public"
+license: "MIT"
+links:
+  repository: "https://github.com/openpipelines-bio/openpipeline"
+  docker_registry: "ghcr.io"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    label:
+    - "midcpu"
+    - "midmem"
+    - "gpu"
+    - "highdisk"
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+engines:
+- type: "docker"
+  id: "docker"
+  image: "nvcr.io/nvidia/pytorch:25.05-py3"
+  target_registry: "images.viash-hub.com"
+  target_tag: "main"
+  namespace_separator: "/"
+  setup:
+  - type: "python"
+    user: false
+    packages:
+    - "anndata~=0.11.1"
+    - "mudata~=0.3.1"
+    - "scanpy~=1.10.4"
+    script:
+    - "exec(\"try:\\n  import awkward\\nexcept ModuleNotFoundError:\\n  exit(0)\\\
+      nelse:  exit(1)\")"
+    upgrade: true
+  - type: "python"
+    user: false
+    packages:
+    - "jax[cuda]"
+    - "scvi-tools~=1.3.1"
+    upgrade: true
+  test_setup:
+  - type: "python"
+    user: false
+    packages:
+    - "viashpy==0.8.0"
+    upgrade: true
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:
+  config: "src/integrate/scvi/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/integrate/scvi"
+  executable: "target/executable/integrate/scvi/scvi"
+  viash_version: "0.9.4"
+  git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
+  git_remote: "https://github.com/openpipelines-bio/openpipeline"
+  git_tag: "0.2.0-2055-g173327cc"
+package_config:
+  name: "openpipeline"
+  version: "main"
+  summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
+  description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
+    \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
+    \nIn terms of workflows, the following has been made available, but keep in mind\
+    \ that\nindividual tools and functionality can be executed as standalone components\
+    \ as well.\n\n  * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
+    \  * Ingestion: Read mapping and generating a count matrix.\n  * Single sample\
+    \ processing: cell filtering and doublet detection.\n  * Multisample processing:\
+    \ Count transformation, normalization, QC metric calculations.\n  * Integration:\
+    \ Clustering, integration and batch correction using single and multimodal methods.\n\
+    \  * Downstream analysis workflows\n"
+  info:
+    test_resources:
+    - type: "s3"
+      path: "s3://openpipelines-data"
+      dest: "resources_test"
+    nextflow_labels_ci:
+    - path: "src/workflows/utils/labels_ci.config"
+      description: "Adds the correct memory and CPU labels when running on the Viash\
+        \ Hub CI."
+  viash_version: "0.9.4"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
+    .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
+    )'\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'main'"
+  keywords:
+  - "single-cell"
+  - "multimodal"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/openpipelines-bio/openpipeline"
+    docker_registry: "ghcr.io"
+    homepage: "https://openpipelines.bio"
+    documentation: "https://openpipelines.bio/fundamentals"
+    issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
--- a/target/executable/integrate/scvi/compress_h5mu.py
+++ b/target/executable/integrate/scvi/compress_h5mu.py
@@ -0,0 +1,87 @@
+import shutil
+from anndata import AnnData
+from mudata import write_h5ad
+from h5py import File as H5File
+from h5py import Group, Dataset
+from pathlib import Path
+from typing import Union, Literal
+from functools import partial
+
+
+def compress_h5mu(
+    input_path: Union[str, Path],
+    output_path: Union[str, Path],
+    compression: Union[Literal["gzip"], Literal["lzf"]],
+):
+    input_path, output_path = str(input_path), str(output_path)
+
+    def copy_attributes(in_object, out_object):
+        for key, value in in_object.attrs.items():
+            out_object.attrs[key] = value
+
+    def visit_path(
+        output_h5: H5File,
+        compression: Union[Literal["gzip"], Literal["lzf"]],
+        name: str,
+        object: Union[Group, Dataset],
+    ):
+        if isinstance(object, Group):
+            new_group = output_h5.create_group(name)
+            copy_attributes(object, new_group)
+        elif isinstance(object, Dataset):
+            # Compression only works for non-scalar Dataset objects
+            # Scalar objects dont have a shape defined
+            if not object.compression and object.shape not in [None, ()]:
+                new_dataset = output_h5.create_dataset(
+                    name, data=object, compression=compression
+                )
+                copy_attributes(object, new_dataset)
+            else:
+                output_h5.copy(object, name)
+        else:
+            raise NotImplementedError(
+                f"Could not copy element {name}, "
+                f"type has not been implemented yet: {type(object)}"
+            )
+
+    with (
+        H5File(input_path, "r") as input_h5,
+        H5File(output_path, "w", userblock_size=512) as output_h5,
+    ):
+        copy_attributes(input_h5, output_h5)
+        input_h5.visititems(partial(visit_path, output_h5, compression))
+
+    with open(input_path, "rb") as input_bytes:
+        # Mudata puts metadata like this in the first 512 bytes:
+        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
+        # See mudata/_core/io.py, read_h5mu() function
+        starting_metadata = input_bytes.read(100)
+        # The metadata is padded with extra null bytes up until 512 bytes
+        truncate_location = starting_metadata.find(b"\x00")
+        starting_metadata = starting_metadata[:truncate_location]
+    with open(output_path, "br+") as f:
+        nbytes = f.write(starting_metadata)
+        f.write(b"\0" * (512 - nbytes))
+
+
+def write_h5ad_to_h5mu_with_compression(
+    output_file: Union[str, Path],
+    h5mu: Union[str, Path],
+    modality_name: str,
+    modality_data: AnnData,
+    output_compression=None,
+):
+    output_file = Path(output_file)
+    h5mu = Path(h5mu)
+    output_file_uncompressed = (
+        output_file.with_name(output_file.stem + "_uncompressed.h5mu")
+        if output_compression
+        else output_file
+    )
+    shutil.copyfile(h5mu, output_file_uncompressed)
+    write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
+    if output_compression:
+        compress_h5mu(
+            output_file_uncompressed, output_file, compression=output_compression
+        )
+        output_file_uncompressed.unlink()
--- a/target/executable/integrate/scvi/nextflow_labels.config
+++ b/target/executable/integrate/scvi/nextflow_labels.config
@@ -0,0 +1,48 @@
+process {
+  // Default resources for components that hardly do any processing
+  memory = { 2.GB * task.attempt }
+  cpus = 1
+
+  // Retry for exit codes that have something to do with memory issues
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+  maxRetries = 3
+
+  // The memory a task is assigned increases with each attempt.
+  // Uncomment the line below and adjust the value to set a global upper limit on the memory.
+  // resourceLimits = [ memory: 240.Gb ] 
+
+  // CPU resources
+  withLabel: singlecpu { cpus = 1 }
+  withLabel: lowcpu { cpus = 4 }
+  withLabel: midcpu { cpus = 10 }
+  withLabel: highcpu { cpus = 20 }
+  
+  // Memory resources
+  // Each label requests its base amount multiplied by the attempt number.
+  // When a memory resource limit is set and the attempt number has reached
+  // maxRetries, the resource limit itself is requested instead.
+  withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
+  withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
+  withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
+  withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
+
+  // Disk space
+  // Every disk label below reuses process.disk when it is set and yields null otherwise.
+  withLabel: lowdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: middisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: highdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: veryhighdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  
+  // NOTE: The above labels intentionally do not have an effect by default.
+  // The user should set the disk space requirements by adding the following
+  // to the compute environment:
+  //
+  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
+  // withLabel: middisk { disk = { 100.GB * task.attempt } }
+  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
+  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
+}
--- a/target/executable/integrate/scvi/scvi
+++ b/target/executable/integrate/scvi/scvi
--- a/target/executable/integrate/scvi/set_var_index.py
+++ b/target/executable/integrate/scvi/set_var_index.py
@@ -0,0 +1,24 @@
+import anndata as ad
+import re
+
+
+def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
+    """Sanitize gene names and set the index of the .var DataFrame.
+
+    Parameters
+    ----------
+    adata : AnnData
+        Annotated data object
+    var_name : str | None
+        Name of the column in `adata.var` that contains the gene names, if None, the existing index will be sanitized but not replaced.
+
+    Returns
+    -------
+    AnnData
+        Copy of `adata` with sanitized and replaced index
+    """
+    if var_name:
+        adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var[var_name]]
+    else:
+        adata.var.index = [re.sub("\\.[0-9]+$", "", s) for s in adata.var.index]
+    return adata
--- a/target/executable/integrate/scvi/subset_vars.py
+++ b/target/executable/integrate/scvi/subset_vars.py
@@ -0,0 +1,31 @@
+def subset_vars(adata, subset_col):
+    """Subset AnnData object on highly variable genes
+
+    Parameters
+    ----------
+    adata : AnnData
+        Annotated data object
+    subset_col : str
+        Name of the boolean column in `adata.var` that contains the information if features should be used or not
+
+    Returns
+    -------
+    AnnData
+        Copy of `adata` with subsetted features
+    """
+    if subset_col not in adata.var.columns:
+        raise ValueError(
+            f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
+        )
+
+    if adata.var[subset_col].dtype == "boolean":
+        assert adata.var[subset_col].isna().sum() == 0, (
+            f"The .var column `{subset_col}` contains NaN values. Can not subset data."
+        )
+        adata.var[subset_col] = adata.var[subset_col].astype("bool")
+
+    assert adata.var[subset_col].dtype == "bool", (
+        f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data."
+    )
+
+    return adata[:, adata.var[subset_col]].copy()
--- a/target/executable/integrate/totalvi/.config.vsh.yaml
+++ b/target/executable/integrate/totalvi/.config.vsh.yaml
@@ -0,0 +1,395 @@
+name: "totalvi"
+namespace: "integrate"
+version: "main"
+authors:
+- name: "Vladimir Shitov"
+  info:
+    role: "Contributor"
+    links:
+      email: "vladimir.shitov@helmholtz-muenchen.de"
+      github: "vladimirshitov"
+      orcid: "0000-0002-1960-8812"
+      linkedin: "vladimir-shitov-9a659513b"
+    organizations:
+    - name: "Helmholtz Munich"
+      href: "https://www.helmholtz-munich.de"
+      role: "PhD Candidate"
+argument_groups:
+- name: "Inputs"
+  arguments:
+  - type: "file"
+    name: "--input"
+    alternatives:
+    - "-i"
+    description: "Input h5mu file with query data to integrate with reference."
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--reference"
+    alternatives:
+    - "-r"
+    description: "Input h5mu file with reference data to train the TOTALVI model."
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--force_retrain"
+    alternatives:
+    - "-f"
+    description: "If true, retrain the model and save it to reference_model_path"
+    info: null
+    direction: "input"
+  - type: "string"
+    name: "--query_modality"
+    info: null
+    default:
+    - "rna"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--query_proteins_modality"
+    description: "Name of the modality in the input (query) h5mu file containing protein\
+      \ data"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--reference_modality"
+    info: null
+    default:
+    - "rna"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--reference_proteins_modality"
+    description: "Name of the modality containing proteins in the reference"
+    info: null
+    default:
+    - "prot"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--input_layer"
+    description: "Input layer to use. If None, X is used"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obs_batch"
+    description: "Column name discriminating between your batches."
+    info: null
+    default:
+    - "sample_id"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--var_input"
+    description: ".var column containing highly variable genes. By default, do not\
+      \ subset genes."
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Outputs"
+  arguments:
+  - type: "file"
+    name: "--output"
+    alternatives:
+    - "-o"
+    description: "Output h5mu file."
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obsm_output"
+    description: "In which .obsm slot to store the resulting integrated embedding."
+    info: null
+    default:
+    - "X_integrated_totalvi"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obsm_normalized_rna_output"
+    description: "In which .obsm slot to store the normalized RNA from TOTALVI."
+    info: null
+    default:
+    - "X_totalvi_normalized_rna"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--obsm_normalized_protein_output"
+    description: "In which .obsm slot to store the normalized protein data from TOTALVI."
+    info: null
+    default:
+    - "X_totalvi_normalized_protein"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--reference_model_path"
+    description: "Directory with the reference model. If it does not exist, the\
+      \ trained model will be saved there."
+    info: null
+    default:
+    - "totalvi_model_reference"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--query_model_path"
+    description: "Directory, where the query model will be saved"
+    info: null
+    default:
+    - "totalvi_model_query"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+- name: "Learning parameters"
+  arguments:
+  - type: "integer"
+    name: "--max_epochs"
+    description: "Number of passes through the dataset"
+    info: null
+    default:
+    - 400
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--max_query_epochs"
+    description: "Number of passes through the dataset, when fine-tuning model for\
+      \ query"
+    info: null
+    default:
+    - 200
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--weight_decay"
+    description: "Weight decay, when fine-tuning model for query"
+    info: null
+    default:
+    - 0.0
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+resources:
+- type: "python_script"
+  path: "script.py"
+  is_executable: true
+- type: "file"
+  path: "setup_logger.py"
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+description: "Performs mapping to the reference by totalvi model: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html#Reference-mapping-with-TOTALVI"
+test_resources:
+- type: "python_script"
+  path: "test.py"
+  is_executable: true
+- type: "file"
+  path: "pbmc_1k_protein_v3_mms.h5mu"
+info: null
+status: "enabled"
+scope:
+  image: "public"
+  target: "public"
+license: "MIT"
+links:
+  repository: "https://github.com/openpipelines-bio/openpipeline"
+  docker_registry: "ghcr.io"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    label:
+    - "highmem"
+    - "highcpu"
+    - "highdisk"
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+engines:
+- type: "docker"
+  id: "docker"
+  image: "nvcr.io/nvidia/pytorch:25.05-py3"
+  target_registry: "images.viash-hub.com"
+  target_tag: "main"
+  namespace_separator: "/"
+  setup:
+  - type: "python"
+    user: false
+    packages:
+    - "anndata~=0.11.1"
+    - "mudata~=0.3.1"
+    - "jax[cuda]"
+    - "scvi-tools~=1.3.1"
+    script:
+    - "exec(\"try:\\n  import awkward\\nexcept ModuleNotFoundError:\\n  exit(0)\\\
+      nelse:  exit(1)\")"
+    upgrade: true
+  test_setup:
+  - type: "python"
+    user: false
+    packages:
+    - "viashpy==0.8.0"
+    upgrade: true
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:
+  config: "src/integrate/totalvi/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/integrate/totalvi"
+  executable: "target/executable/integrate/totalvi/totalvi"
+  viash_version: "0.9.4"
+  git_commit: "173327cc5670aa8bd5cf473827de80b602c90092"
+  git_remote: "https://github.com/openpipelines-bio/openpipeline"
+  git_tag: "0.2.0-2055-g173327cc"
+package_config:
+  name: "openpipeline"
+  version: "main"
+  summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
+  description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
+    \ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
+    \nIn terms of workflows, the following has been made available, but keep in mind\
+    \ that\nindividual tools and functionality can be executed as standalone components\
+    \ as well.\n\n  * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
+    \  * Ingestion: Read mapping and generating a count matrix.\n  * Single sample\
+    \ processing: cell filtering and doublet detection.\n  * Multisample processing:\
+    \ Count transformation, normalization, QC metric calculations.\n  * Integration:\
+    \ Clustering, integration and batch correction using single and multimodal methods.\n\
+    \  * Downstream analysis workflows\n"
+  info:
+    test_resources:
+    - type: "s3"
+      path: "s3://openpipelines-data"
+      dest: "resources_test"
+    nextflow_labels_ci:
+    - path: "src/workflows/utils/labels_ci.config"
+      description: "Adds the correct memory and CPU labels when running on the Viash\
+        \ Hub CI."
+  viash_version: "0.9.4"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
+    .runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
+    )'\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'main'"
+  keywords:
+  - "single-cell"
+  - "multimodal"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/openpipelines-bio/openpipeline"
+    docker_registry: "ghcr.io"
+    homepage: "https://openpipelines.bio"
+    documentation: "https://openpipelines.bio/fundamentals"
+    issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
--- a/target/executable/integrate/totalvi/nextflow_labels.config
+++ b/target/executable/integrate/totalvi/nextflow_labels.config
@@ -0,0 +1,48 @@
+process {
+  // Default resources for components that hardly do any processing
+  memory = { 2.GB * task.attempt }
+  cpus = 1
+
+  // Retry on exit codes 137-140 (usually memory-related failures); terminate on any other error
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+  maxRetries = 3
+
+  // The memory a task is assigned increases with each attempt.
+  // Uncomment the line below and adjust the value to set a global upper limit on the memory.
+  // resourceLimits = [ memory: 240.GB ]
+
+  // CPU resources
+  withLabel: singlecpu { cpus = 1 }
+  withLabel: lowcpu { cpus = 4 }
+  withLabel: midcpu { cpus = 10 }
+  withLabel: highcpu { cpus = 20 }
+  
+  // Memory resources: scale with task.attempt; once task.attempt >= task.maxRetries and resourceLimits.memory is set, request that limit instead
+  withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
+  withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
+  withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
+  withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
+
+  // Disk space (placeholder labels; see the NOTE below)
+  withLabel: lowdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: middisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: highdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  withLabel: veryhighdisk {
+    disk = {process.disk ? process.disk : null}
+  }
+  
+  // NOTE: The above labels intentionally do not have an effect by default.
+  // The user should set the disk space requirements by adding the following
+  // to the compute environment:
+  //
+  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
+  // withLabel: middisk { disk = { 100.GB * task.attempt } }
+  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
+  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
+}
--- a/target/executable/integrate/totalvi/setup_logger.py
+++ b/target/executable/integrate/totalvi/setup_logger.py
@@ -0,0 +1,12 @@
+def setup_logger():
+    import logging
+    from sys import stdout
+    # Configure the root logger: INFO level, records written to stdout.
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+    # NOTE: a new handler is added on every call; the root logger is returned.
+    return logger
--- a/target/executable/integrate/totalvi/totalvi
+++ b/target/executable/integrate/totalvi/totalvi