Build branch fix-integration-tests with version fix-integration-tests (da62b4ff)

Build pipeline: vsh-ci-dev-gckj5 Source commit: da62b4ffe3 Source message: Add labels to qc_test component
2024-11-15 14:37:33 +00:00
parent 43cfb251c7
commit bb7533583f
1550 changed files with 913066 additions and 791 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-# openpipelines x.x.x
+# openpipelines 2.x.x (Unreleased)
 ## BREAKING CHANGES
@@ -42,8 +42,6 @@
  - Store label probabilities instead of uncertainties
  - Take `.h5mu` format as an input instead of `.h5ad`
 * `labels_transfer/knn`: delete outdated component due to its functionality now implemented in `labels_transfer/pynndescent_knn`
 * `reference/build_cellranger_arc_reference`: a default value of "output" is now specified for the argument `--genome`, in line with the `reference/build_cellranger_reference` component. Additionally, providing a value for `--organism` is no longer required and its default value of `Homo Sapiens` has been removed (PR #864).
 ## NEW FUNCTIONALITY
@@ -91,8 +89,6 @@
 * `dataflow/split_h5mu` component: Added a component to split a single h5mu file into multiple h5mu files based on the values of an .obs column (PR #824).
 * `labels_transfer/pynndescent_knn`: component: Added a component for KNN classification based on a PyNNDescent neighborhood graph (PR #830).
 * `workflows/test_workflows/ingestion` components & `workflows/ingestion`: Added standalone components for integration testing of ingestion workflows (PR #801). 
 * `workflows/ingestion/make_reference`: Add additional arguments passed through to the STAR and BD Rhapsody reference components (PR #846).
@@ -103,7 +99,7 @@
 * `dimred/densmap` component: Added a densMAP dimensionality reduction component (PR #748).
-* `annotete/scanvi` component: Added a component to annotate cells using scANVI (PR #833).
+* `annotate/scanvi` component: Added a component to annotate cells using scANVI (PR #833).
 * `transform/bpcells_regress_out` component: Added a component to regress out effects of confounding variables in the count matrix using BPCells (PR #863).
@@ -129,6 +125,10 @@
 * `metadata/duplicate_var` component: Added a component to make a copy from one .var field or index to another .var field within the same MuData object (PR #877).
 * `filter/subset_obsp` component: Added a component to subset an .obsp matrix by column based on the value of an .obs field. The resulting subset is moved to an .obsm field (PR #888).
 * `labels_transfer/knn` component: Enable using additional distance functions for KNN classification (PR #830) and allow performing KNN classification based on a pre-calculated neighborhood graph (PR #890).
 ## MINOR CHANGES
 * `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726).
@@ -142,8 +142,6 @@
 * Bump scvelo to `0.3.2` (PR #828).
 * Bump viash to `0.8.6` (PR #815).
 * Pin numpy<2 for several components (PR #815).
 * Added `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: download tiny bcl file with an ATAC experiment, download a motifs file, demultiplex bcl files to reads in fastq format (PR #726).
@@ -162,23 +160,38 @@
 ## BUG FIXES
 * `dataflow/concatenate_h5mu`: fix writing out multidimensional annotation dataframes (e.g. `.varm`) that had their 
  data dtype (dtype) changed as a result of adding more observations after concatenation, causing `TypeError`.
  One notable example of this happening is when one of the samples does not have a multimodal annotation dataframe 
  which is present in another sample; causing the values being filled with `NA` (PR #837).
 * `qc/calculate_qc_metrics`: increase total counts accuracy with low precision floating dtypes as input layer (PR #852).
 * Fix failing tests for `ingestion/cellranger_postprocessing`, `ingestion/conversion` and `multiomics/process_batches` (PR #869).
 * `convert/from_10xh5_to_h5mu`: add .uns slot to mdata root when metrics file is provided (PR #887).
 * Use `params.resources_test` in test workflows in order to point to an alternative location (e.g. a cache).
 * Fix ingestion components not working when optional arguments are unset (PR #894).
 ## DOCUMENTATION
 * Update authorship of components (PR #835).
 # openpipelines 1.0.3
 ## BUG FIXES
 * `qc/calculate_qc_metrics`: increase total counts accuracy with low precision floating dtypes as input layer (PR # , backported from PR #852).
 # openpipelines 1.0.2
 ## BUG FIXES
 * `dataflow/concatenate_h5mu`: fix writing out multidimensional annotation dataframes (e.g. `.varm`) that had their 
  data dtype (dtype) changed as a result of adding more observations after concatenation, causing `TypeError`.
  One notable example of this happening is when one of the samples does not have a multimodal annotation dataframe 
  which is present in another sample; causing the values being filled with `NA` (PR #842, backported from PR #837).
 # openpipelines 1.0.1
 ## BUG FIXES
 * Bump viash to `0.8.6` (PR #816, backported from #815). This changes the at-runtime generated nextflow process from an in-memory to an on-disk temporary file, which should cause fewer issues with Nextflow Fusion.
 # openpipelines 1.0.0-rc6
 ## BUG FIXES
--- a/src/annotate/scanvi/config.vsh.yaml
+++ b/src/annotate/scanvi/config.vsh.yaml
@@ -23,27 +23,26 @@ argument_groups:
        type: string
        default: "rna"
        required: false
      - name: "--var_input_gene_names"
        description: .var field containing the gene names, if the .var index is not to be used.
        type: string
        required: false
-  - name: Reference
+  - name: Reference model
-    description: Arguments related to the reference dataset.
+    description: Arguments related to the reference model.
    arguments:
      - name: "--reference"
        type: file
        description: Reference h5mu file.
        direction: input
        required: true
        example: reference.h5mu
      - name: "--scvi_reference_model"
        type: file
-        description: "Pretrained scvi reference model"
+        description: "Pretrained SCVI reference model to initialize the SCANVI model with. The model must have been saved together with the AnnData object that was used to train it."
        example: scvi_model.pt
        direction: input
-        required: true
+        required: false
-      - name: "--reference_obs_label"
+      - name: "--scanvi_reference_model"
-        type: string
+        type: file
-        description: Key in obs field of reference AnnData with cell-type information.
+        description: "Pretrained SCANVI reference model."
-        example: "cell_ontology_class"
+        example: scvi_model.pt
-        required: true
+        direction: input
        required: false
  - name: SCANVI reference model training arguments
    description: Arguments related to the reference SCANVI model.
@@ -190,6 +189,7 @@ resources:
  - type: python_script
    path: script.py
  - path: /src/utils/setup_logger.py
  - path: /src/annotate/utils/query_reference_allignment.py
 test_resources:
  - type: python_script
--- a/src/annotate/scanvi/script.py
+++ b/src/annotate/scanvi/script.py
@@ -7,14 +7,39 @@ import numpy as np
 par = {
    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu",
    "modality": "rna",
-    "reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5ad",
+    "var_query_gene_names": None,
-    "scvi_reference_model": "resources_test/annotation_test_data/scvi_model.pt",
+    "scvi_reference_model": "resources_test/annotation_test_data/scvi_model",
-    "reference_obs_label": "cell_ontology_class",
+    "scanvi_reference_model": None,
    "unknown_celltype": "Unkown",
    "output": "output.h5mu",
    "output_obsm_scanvi_embedding": "scanvi_embedding",
    "output_obs_predictions": "scanvi_pred",
    "output_obs_probability": "scanvi_probability",
    "output_model": None,
    "output_compression": None,
    "reference_learning_rate": 1e-3,
    "reference_reduce_lr_on_plateau": True,
    "reference_lr_patience": 25,
    "reference_lr_factor": 0.5,
    "reference_train_size": 0.9,
    "reference_max_epochs": 10,
    "reference_early_stopping": True,
    "reference_early_stopping_patience": 50,
    "query_train_size": 0.9,
    "query_max_epochs": 10,
    "query_learning_rate": 1e-3,
    "query_reduce_lr_on_plateau": True,
    "query_lr_patience": 25,
    "query_lr_factor": 0.5,
    "query_early_stopping": True,
    "query_early_stopping_patience": 50
 }
-meta = {}
+meta = {"resources_dir": "src/annotate/utils"}
 ## VIASH END
 sys.path.append(meta["resources_dir"])
 from query_reference_allignment import set_var_index, cross_check_genes
 # START TEMPORARY WORKAROUND setup_logger
 # reason: resources aren't available when using Nextflow fusion
 # from setup_logger import setup_logger
@@ -33,72 +58,104 @@ def setup_logger():
 # END TEMPORARY WORKAROUND setup_logger
 logger = setup_logger()
-logger.info("Reading the input and reference data")
+if (not par["scvi_reference_model"]) and not (par["scanvi_reference_model"]) or (par["scvi_reference_model"] and par["scanvi_reference_model"]):
    raise ValueError("Make sure to provide either an '--scvi_reference_model' or a '--scanvi_reference_model', but not both.")
 input_data = mu.read_h5mu(par["input"])
 query = input_data.mod[par["modality"]]
 reference_data = mu.read_h5mu(par["reference"])
 reference = reference_data.mod[par["modality"]]
-logger.info(f"Loading the pretrained scVI model from {par['scvi_reference_model']}")
+def main():
-scvi_reference_model = scvi.model.SCVI.load(par["scvi_reference_model"], reference)
+    logger.info("Reading the query data")
    # Read in data
    input_data = mu.read_h5mu(par["input"])
    input_modality = input_data.mod[par["modality"]].copy()
    # scANVI requires query and reference gene names to be equivalent 
    input_modality = set_var_index(input_modality, par["var_input_gene_names"])
-logger.info("Setting up scANVI model")
+    if par["scanvi_reference_model"]:
-scanvi_ref = scvi.model.SCANVI.from_scvi_model(
+        logger.info(f"Loading the pretrained scANVI model from {par['scanvi_reference_model']} and updating it with the query data {par['input']}")
-    scvi_reference_model,
+        scanvi_query = scvi.model.SCANVI.load_query_data(
-    unlabeled_category=par["unknown_celltype"],
+            input_modality,
-    labels_key=par["reference_obs_label"],
+            par["scanvi_reference_model"],
            freeze_classifier=True,
            inplace_subset_query_vars=True
            )
    elif par["scvi_reference_model"]:
        logger.info("Reading in the reference model and associated reference data")
        scvi_reference_model = scvi.model.SCVI.load(par["scvi_reference_model"])
        reference = scvi_reference_model.adata
        logger.info("Alligning genes in reference and query dataset")
        # scANVI requires query and reference gene names to be equivalent 
        reference = set_var_index(reference)
        # Subset query dataset based on genes present in reference
        common_ens_ids = cross_check_genes(input_modality, reference)
        input_modality = input_modality[:, common_ens_ids]
        logger.info("Instantiating scANVI model from the scVI model")
        scanvi_ref = scvi.model.SCANVI.from_scvi_model(
            scvi_reference_model,
            unlabeled_category=par["unknown_celltype"],
            labels_key=scvi_reference_model.adata_manager._registry["setup_args"]["labels_key"],
            )
        reference_plan_kwargs = {
            "lr": par["reference_learning_rate"],
            "reduce_lr_on_plateau": par['reference_reduce_lr_on_plateau'],
            "lr_patience": par['reference_lr_patience'],
            "lr_factor": par['reference_lr_factor']
            }
        logger.info("Training scANVI model on reference data with celltype labels")
        scanvi_ref.train(
            train_size=par["reference_train_size"],
            max_epochs=par['reference_max_epochs'],
            early_stopping=par['reference_early_stopping'],
            early_stopping_patience=par['reference_early_stopping_patience'],
            plan_kwargs=reference_plan_kwargs,
            check_val_every_n_epoch=1,
            accelerator="auto"
        )
        logger.info(f"Updating scANVI model with query data {par['input']}")
        scvi.model.SCANVI.prepare_query_anndata(input_modality, scanvi_ref, inplace=True)
        scanvi_query = scvi.model.SCANVI.load_query_data(input_modality, scanvi_ref)
    logger.info("Training scANVI model with query data")
    query_plan_kwargs = {
        "lr": par["query_learning_rate"],
        "reduce_lr_on_plateau": par['query_reduce_lr_on_plateau'],
        "lr_patience": par['query_lr_patience'],
        "lr_factor": par['query_lr_factor']
        }
    scanvi_query.train(
        train_size=par["query_train_size"],
        max_epochs=par['query_max_epochs'],
        early_stopping=par['query_early_stopping'],
        early_stopping_patience=par['query_early_stopping_patience'],
        plan_kwargs=query_plan_kwargs,
        check_val_every_n_epoch=1,
        accelerator="auto"
    )
-reference_plan_kwargs = {"lr": par["reference_learning_rate"],
+    logger.info("Adding latent representation to query data")
-                         "reduce_lr_on_plateau": par['reference_reduce_lr_on_plateau'],
+    input_modality.obsm[par["output_obsm_scanvi_embedding"]] = scanvi_query.get_latent_representation()
                         "lr_patience": par['reference_lr_patience'],
                         "lr_factor": par['reference_lr_factor']
                        }
-logger.info("Training scANVI model on reference data with celltype labels")
+    logger.info("Running predictions on query data")
    input_modality.obs[par["output_obs_predictions"]] = scanvi_query.predict(input_modality)
    input_modality.obs[par["output_obs_probability"]] = np.max(scanvi_query.predict(input_modality, soft=True), axis=1)
-scanvi_ref.train(
+    logger.info("Saving output and model")
-    train_size=par["reference_train_size"],
+    input_data.mod[par["modality"]] = input_modality
-    max_epochs=par['reference_max_epochs'],
+    input_data.write_h5mu(par["output"], compression=par["output_compression"])
    early_stopping=par['reference_early_stopping'],
    early_stopping_patience=par['reference_early_stopping_patience'],
    plan_kwargs=reference_plan_kwargs,
    check_val_every_n_epoch=1,
    accelerator="auto",
 )
-logger.info("Updating and training scANVI model with query data")
+    if par["output_model"]:
-scvi.model.SCANVI.prepare_query_anndata(query, scanvi_ref, inplace=True)
+        scanvi_query.save(par["output_model"], overwrite=True)
 scanvi_query = scvi.model.SCANVI.load_query_data(query, scanvi_ref)
 query_plan_kwargs = {"lr": par["query_learning_rate"],
                     "reduce_lr_on_plateau": par['query_reduce_lr_on_plateau'],
                     "lr_patience": par['query_lr_patience'],
                     "lr_factor": par['query_lr_factor']
                    }
-scanvi_query.train(
+if __name__ == '__main__':
-    train_size=par["query_train_size"],
+    main()
    max_epochs=par['query_max_epochs'],
    early_stopping=par['query_early_stopping'],
    early_stopping_patience=par['query_early_stopping_patience'],
    plan_kwargs=query_plan_kwargs,
    check_val_every_n_epoch=1,
    accelerator="auto",
 )
 logger.info("Adding latent representation to query data")
 query.obsm[par["output_obsm_scanvi_embedding"]] = scanvi_query.get_latent_representation()
 logger.info("Running predictions on query data")
 query.obs[par["output_obs_predictions"]] = scanvi_query.predict(query)
 query.obs[par["output_obs_probability"]] = np.max(scanvi_query.predict(query, soft=True), axis=1)
 logger.info("Saving output and model")
 input_data.mod[par["modality"]] = query
 input_data.write_h5mu(par["output"], compression=par["output_compression"])
 if par["output_model"]:
    scanvi_query.save(par["output_model"], overwrite=True)
--- a/src/annotate/scanvi/test.py
+++ b/src/annotate/scanvi/test.py
@@ -1,9 +1,9 @@
 import subprocess
 import sys
 import os
 import pytest
 import re
 import mudata as mu
 import anndata as ad
 from openpipelinetestutils.asserters import assert_annotation_objects_equal
 import scvi
 import os
@@ -16,6 +16,7 @@ meta = {
 input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
 reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
@pytest.fixture
 def create_scvi_model(random_path, tmp_path):
    def wrapper(input_file, reference_file):
@@ -23,7 +24,7 @@ def create_scvi_model(random_path, tmp_path):
        input_modality = input_data.mod["rna"]
        reference_data = mu.read_h5mu(reference_file)
        reference_modality = reference_data.mod["rna"]
-        
+
        reference_data.var["gene_symbol"] = list(reference_data.var.index)
        reference_data.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_data.var["ensemblid"]]
        reference_modality.var["gene_symbol"] = list(reference_modality.var.index)
@@ -47,30 +48,49 @@ def create_scvi_model(random_path, tmp_path):
            n_layers=1,
            )
        scvi_model.train(max_epochs=10)
-        
+
        input_data.mod["rna"] = query
-        reference_data.mod["rna"] = reference
+        # reference_data.mod["rna"] = reference
-        
+
        input_data_file = random_path(extension="h5mu")
-        reference_file = random_path(extension="h5mu")
+        # reference_file = random_path(extension="h5mu")
        scvi_model_file = tmp_path
-        
+
        input_data.write_h5mu(input_data_file)
-        reference_data.write_h5mu(reference_file)
+        # reference_data.write_h5mu(reference_file)
-        scvi_model.save(scvi_model_file, overwrite=True)
+        scvi_model.save(scvi_model_file, save_anndata=True, overwrite=True)
-                
+
-        return scvi_model_file, input_data_file, reference_file
+        return scvi_model_file, input_data_file
    return wrapper
@pytest.fixture
 def create_scanvi_model(create_scvi_model, tmp_path):
    """Fixture returning a factory that trains and saves a small SCANVI model.

    The factory first builds an SCVI model through `create_scvi_model`, derives a
    SCANVI model from it, trains it for 10 epochs and saves it (including its
    AnnData) to disk.

    Returns (from the factory):
        A tuple `(scanvi_model_file, input_data_file)`: the directory the SCANVI
        model was saved to and the query file produced by `create_scvi_model`.
    """
    def scanvi_wrapper():
        scvi_model_file, input_data_file = create_scvi_model(input_file, reference_file)
        scvi_model = scvi.model.SCVI.load(scvi_model_file)
        scanvi_model = scvi.model.SCANVI.from_scvi_model(
            scvi_model,
            unlabeled_category="Unkown",
            labels_key="cell_ontology_class",
            )
        scanvi_model.train(max_epochs=10)
        # `create_scvi_model` saves the SCVI model directly into `tmp_path`.
        # Saving the SCANVI model there as well (with overwrite=True) would
        # clobber it, so tests that need both models would end up passing the
        # same directory twice. Use a dedicated sub-directory instead.
        scanvi_model_file = tmp_path / "scanvi_model"
        scanvi_model.save(scanvi_model_file, save_anndata=True, overwrite=True)
        return scanvi_model_file, input_data_file
    return scanvi_wrapper
 def test_simple_execution(run_component, random_h5mu_path, create_scvi_model):
-    scvi_model_file, input_file_scvi, reference_file_scvi = create_scvi_model(input_file, reference_file)
+    scvi_model_file, input_file_scvi = create_scvi_model(input_file, reference_file)
    output_file = random_h5mu_path()
    run_component([
        "--input", input_file_scvi,
        "--reference", reference_file_scvi,
        "--scvi_reference_model", scvi_model_file,
        "--reference_obs_label", "cell_ontology_class",
        "--reference_max_epochs", "10",
        "--query_max_epochs", "10",
        "--output", output_file
@@ -80,7 +100,7 @@ def test_simple_execution(run_component, random_h5mu_path, create_scvi_model):
    input_mudata = mu.read_h5mu(input_file_scvi)
    output_mudata = mu.read_h5mu(output_file)
-    
+
    assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed"
    assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed"
    assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added"
@@ -89,16 +109,15 @@ def test_simple_execution(run_component, random_h5mu_path, create_scvi_model):
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])
-    
+
 def test_multiple_arguments(run_component, random_h5mu_path, create_scvi_model, tmp_path):
-    scvi_model_file, input_file_scvi, reference_file_scvi = create_scvi_model(input_file, reference_file)
+    scvi_model_file, input_file_scvi = create_scvi_model(input_file, reference_file)
    output_file = random_h5mu_path()
    run_component([
        "--input", input_file_scvi,
        "--reference", reference_file_scvi,
        "--scvi_reference_model", scvi_model_file,
        "--reference_obs_label", "cell_ontology_class",
        "--output", output_file,
        "--reference_max_epochs", "10",
        "--reference_reduce_lr_on_plateau", "True",
@@ -123,20 +142,69 @@ def test_multiple_arguments(run_component, random_h5mu_path, create_scvi_model,
    ])
    assert os.path.exists(output_file), "Output file does not exist"
-    assert os.path.exists(tmp_path / "model.pt"), "Model file does not exist" 
+    assert os.path.exists(tmp_path / "model.pt"), "Model file does not exist"
    input_mudata = mu.read_h5mu(input_file_scvi)
    output_mudata = mu.read_h5mu(output_file)
-    
+
    assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed"
    assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed"
    assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added"
    assert "scanvi_pred" in output_mudata.mod["rna"].obs.keys(), "Predictions not added"
    assert "scanvi_probability" in output_mudata.mod["rna"].obs.keys(), "Probabilities not added"
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])
 def test_pretrained_scanvi(run_component, random_h5mu_path, create_scanvi_model):
    """Run the component with a pretrained SCANVI model and check the annotated output.

    Verifies that the output file exists, that the "rna" modality keeps its
    number of observations and variables, that the embedding, prediction and
    probability fields were added, and that the "prot" modality is unchanged.
    """
    scanvi_model_file, input_file_scanvi = create_scanvi_model()
    output_file = random_h5mu_path()
    # NOTE(review): confirm that `--reference_obs_label` is still a declared
    # argument of the component config, and whether `--reference` must be passed.
    run_component([
        "--input", input_file_scanvi,
        "--scanvi_reference_model", scanvi_model_file,
        "--reference_obs_label", "cell_ontology_class",
        "--reference_max_epochs", "10",
        "--query_max_epochs", "10",
        "--output", output_file
    ])
    assert os.path.exists(output_file), "Output file does not exist"
    input_mudata = mu.read_h5mu(input_file_scanvi)
    output_mudata = mu.read_h5mu(output_file)
    # The query modality must keep its shape; only annotations are added.
    assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed"
    assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed"
    assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added"
    assert "scanvi_pred" in output_mudata.mod["rna"].obs.keys(), "Predictions not added"
    assert "scanvi_probability" in output_mudata.mod["rna"].obs.keys(), "Probabilities not added"
    # The modality that was not processed must be passed through untouched.
    assert_annotation_objects_equal(input_mudata.mod["prot"],
                                    output_mudata.mod["prot"])
 def test_raises(run_component, random_h5mu_path, create_scvi_model, create_scanvi_model):
    """Passing both an SCVI and a SCANVI reference model must make the component fail.

    The component run is expected to raise `subprocess.CalledProcessError`, and
    its captured stdout is searched for the ValueError message that explains
    the two model arguments are mutually exclusive.
    """
    scvi_model_file, input_file_scvi = create_scvi_model(input_file, reference_file)
    scanvi_model_file, input_file_scanvi = create_scanvi_model()
    output_file = random_h5mu_path()
    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component([
            "--input", input_file_scanvi,
            "--scanvi_reference_model", scanvi_model_file,
            "--scvi_reference_model", scvi_model_file,
            "--reference_obs_label", "cell_ontology_class",
            "--reference_max_epochs", "10",
            "--query_max_epochs", "10",
            "--output", output_file
        ])
    # The pattern is used as a regex, so "." matches any character here.
    assert re.search(
        r"ValueError: Make sure to provide either an '--scvi_reference_model' or a '--scanvi_reference_model', but not both.",
        err.value.stdout.decode('utf-8')
        )
 if __name__ == '__main__':
-    sys.exit(pytest.main([__file__]))
+    sys.exit(pytest.main([__file__]))
--- a/src/annotate/utils/query_reference_allignment.py
+++ b/src/annotate/utils/query_reference_allignment.py
@@ -0,0 +1,46 @@
 import re
 import anndata as ad
 def setup_logger():
    """Configure the root logger to emit INFO-level messages on stdout.

    The call is idempotent: a stdout handler is attached only when the root
    logger does not already have one, so calling this function repeatedly does
    not make every log message appear multiple times.

    Returns:
        The root `logging.Logger`, with its level set to INFO.
    """
    import logging
    from sys import stdout
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # Each addHandler() call on the root logger adds one more copy of every
    # emitted message, so skip it when stdout is already being served.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler) and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)
    return logger
 # END TEMPORARY WORKAROUND setup_logger
 logger = setup_logger()
 # Helper functions
 def set_var_index(adata: ad.AnnData, var_name: str | None = None):
    """Replace `adata.var.index` with identifiers stripped of a trailing ".<digits>" suffix.

    Args:
        adata: Object whose `.var.index` is overwritten in place.
        var_name: Name of the `.var` column to take the identifiers from. When
            empty or None, the current `.var.index` values are used instead.

    Returns:
        The same `adata` object that was passed in.
    """
    # Pick where the identifiers come from, then clean them in a single pass.
    raw_ids = adata.var[var_name] if var_name else adata.var.index
    adata.var.index = [re.sub("\\.[0-9]+$", "", raw_id) for raw_id in raw_ids]
    return adata
 def cross_check_genes(query: ad.AnnData, reference: ad.AnnData):
    """Return the gene ids present in the `.var` index of both datasets.

    Args:
        query: Query dataset; its `.var.index` holds the gene ids.
        reference: Reference dataset; its `.var.index` holds the gene ids.

    Returns:
        A sorted list of the ids found in both indices. Sorting makes the result
        (and any subsetting based on it) reproducible: iterating a plain set of
        strings yields an order that can differ between interpreter runs.

    Raises:
        AssertionError: If fewer than 100 ids are shared.
    """
    logger.info("Detecting common vars based on gene ids")
    common_ens_ids = sorted(set(reference.var.index).intersection(set(query.var.index)))
    logger.info("  reference n_vars: %i", reference.n_vars)
    logger.info("  input n_vars: %i", query.n_vars)
    logger.info("  intersect n_vars: %i", len(common_ens_ids))
    assert len(common_ens_ids) >= 100, "The intersection of genes between the query and reference dataset is too small."
    return common_ens_ids
 def subset_vars(adata: ad.AnnData, var_column: str | None = None):
    """Select the variables flagged by a `.var` column.

    Args:
        adata: Object to subset along its variable axis.
        var_column: Name of the `.var` column used as the column selector
            (presumably a boolean mask — confirm against callers). When empty
            or None, no subsetting is done.

    Returns:
        `adata` itself when no column is given, otherwise `adata` indexed on
        its second axis by the values of that column.
    """
    if not var_column:
        return adata
    selector = adata.var[var_column]
    return adata[:, selector]
--- a/src/filter/subset_obsp/config.vsh.yaml
+++ b/src/filter/subset_obsp/config.vsh.yaml
@@ -0,0 +1,75 @@
 name: subset_obsp
 namespace: "filter"
 description: |
  Create a subset of an .obsp field in a mudata file, by filtering the columns based on the values of an .obs column. The resulting subset is moved to an .obsm slot.
 authors:
  - __merge__: /src/authors/dorien_roosen.yaml
    roles: [ author, maintainer ]
 argument_groups:
  - name: Input
    arguments:
    - name: "--input"
      type: file
      description: Input h5mu file
      direction: input
      required: true
      example: input.h5mu
    - name: "--modality"
      type: string
      default: "rna"
      required: false
    - name: "--input_obsp_key"
      type: string
      required: true
      description: The .obsp field to be filtered.
    - name: "--input_obs_key"
      type: string
      required: true
      description: The .obs column to filter on.
    - name: "--input_obs_value"
      type: string
      required: true
      description: The value to filter on in the .obs column.
  - name: Output
    arguments:
    - name: "--output"
      type: file
      description: Output h5mu file.
      direction: output
      example: output.h5mu
    - name: "--output_obsm_key"
      type: string
      required: true
      description: The .obsm key to store the subset in.
    - name: "--output_compression"
      type: string
      description: The compression format to be used on the output h5mu object.
      choices: ["gzip", "lzf"]
      required: false
      example: "gzip"
 resources:
  - type: python_script
    path: script.py
  - path: /src/utils/setup_logger.py
 test_resources:
  - type: python_script
    path: test.py
  - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 engines:
 - type: docker
  image: python:3.12-slim
  setup:
    - type: apt
      packages: 
        - procps
    - type: python
      __merge__: /src/base/requirements/anndata_mudata.yaml
  __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
 runners:
 - type: executable
 - type: nextflow
  directives:
    label: [singlecpu, lowmem]
--- a/src/filter/subset_obsp/script.py
+++ b/src/filter/subset_obsp/script.py
@@ -0,0 +1,54 @@
 import mudata as mu
 ### VIASH START
 par = {
  'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu',
  'modality': 'rna',
  'input_obsp_key': 'distances',
  'input_obs_key': 'leiden',
  'input_obs_value': '1',
  'output_obsm_key': "leiden_1",
  'output': 'subset_obsp_output.h5mu',
  'output_compression': None,
 }
 ### VIASH END
 # START TEMPORARY WORKAROUND setup_logger
 # reason: resources aren't available when using Nextflow fusion
 # from setup_logger import setup_logger
 def setup_logger():
    """Attach an stdout handler to the root logger and set its level to INFO.

    Returns:
        The root `logging.Logger`.
    """
    import logging
    import sys

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # Format: timestamp, level name padded to 8 characters, then the message.
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )
    root_logger.addHandler(stdout_handler)
    return root_logger
 # END TEMPORARY WORKAROUND setup_logger
 logger = setup_logger()
 def main():
    """Copy a column subset of an .obsp matrix into .obsm and write the result.

    Reads the input h5mu file, keeps the .obsp columns for which the chosen
    .obs column (compared as strings) equals `par["input_obs_value"]`, stores
    that subset under `par["output_obsm_key"]` in .obsm, and writes the whole
    MuData object to `par["output"]`.
    """
    logger.info(f"Reading {par['input']}")
    mudata_in = mu.read_h5mu(par["input"])
    modality = mudata_in.mod[par["modality"]]

    logger.info(f"Subset columns of obsp matrix under {par['input_obsp_key']} based on {par['input_obs_key']} == {par['input_obs_value']}")
    # The .obs and .obsm indices and both .obsp axes all have length `n_obs`.
    # Only the .obsp columns are filtered here; the row axis is left as it is.
    pairwise = modality.obsp[par["input_obsp_key"]]
    obs_as_text = modality.obs[par["input_obs_key"]].astype(str)
    keep_columns = obs_as_text == par["input_obs_value"]
    column_subset = pairwise[:, keep_columns]

    logger.info(f"Writing subset obsp matrix to .obsm {par['output_obsm_key']}")
    modality.obsm[par["output_obsm_key"]] = column_subset

    logger.info(f"Writing output to {par['output']}")
    mudata_in.write_h5mu(par["output"], compression=par["output_compression"])
 if __name__ == '__main__':
    main()
--- a/src/filter/subset_obsp/test.py
+++ b/src/filter/subset_obsp/test.py
@@ -0,0 +1,48 @@
 import sys
 import pytest
 import mudata as mu
 ## VIASH START
 meta = {
    'resources_dir': 'resources_test/pbmc_1k_protein_v3/'
 }
 ## VIASH END
@pytest.fixture
 def input_h5mu():
    """Load the test MuData and add an .obs column marking the first 50 cells.

    The "rna" modality gets a `filter_column` column set to "group_1" for the
    first 50 observations and "group_2" for all others.
    """
    input = mu.read_h5mu(f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu")
    rna_obs = input.mod["rna"].obs
    rna_obs["filter_column"] = "group_2"
    # Assign through a single positional indexer. The chained form
    # `obs["filter_column"][:50] = ...` writes to an intermediate object and is
    # not guaranteed to reach the DataFrame (it never does under pandas
    # Copy-on-Write).
    rna_obs.iloc[:50, rna_obs.columns.get_loc("filter_column")] = "group_1"
    return input
@pytest.fixture
 def input_path(write_mudata_to_file, input_h5mu):
    """Pass the prepared `input_h5mu` object to the `write_mudata_to_file` fixture.

    Returns whatever that fixture returns (presumably the path of the written
    file — confirm against the fixture definition).
    """
    return write_mudata_to_file(input_h5mu)
 def test_subset_obsp(input_path, run_component, tmp_path):
    """Run the component on `filter_column == "group_1"` and check the .obsm subset.

    The output must exist, contain a "group_1" entry in the "rna" .obsm, and
    that entry must have exactly 50 columns.
    """
    result_file = tmp_path / "output.h5mu"

    # run component
    cli_args = [
        "--input", input_path,
        "--output", str(result_file),
        "--input_obsp_key", "distances",
        "--input_obs_key", "filter_column",
        "--input_obs_value", "group_1",
        "--output_obsm_key", "group_1"
    ]
    run_component(cli_args)
    assert result_file.is_file(), "Output file not found"

    # check output file
    rna_obsm = mu.read_h5mu(result_file).mod["rna"].obsm
    assert "group_1" in rna_obsm, "Output should contain group_1 in .obsm"
    assert rna_obsm["group_1"].shape[1] == 50, "Obsm should only contain a subset of the original obsp matrix"
 if __name__ == "__main__":
    sys.exit(pytest.main([__file__]))
--- a/src/labels_transfer/pynndescent_knn/config.vsh.yaml
+++ b/src/labels_transfer/pynndescent_knn/config.vsh.yaml
@@ -1,7 +1,7 @@
-name: pynndescent_knn
+name: knn
 namespace: "labels_transfer"
 description: |
-  This component generates a neighborhood graph based using the PyNNDescentTransformer, followed by classification using a k-nearest neighborhood vote.
+  This component performs label transfer from reference to query using a K-Nearest Neighbors classifier.
 authors:
  - __merge__: /src/authors/dorien_roosen.yaml
    roles: [ maintainer, author ]
@@ -11,6 +11,27 @@ authors:
 __merge__: ../api/common_arguments.yaml
 argument_groups:
  - name: Input dataset (query) arguments
    arguments:
      - name: "--input_obsm_distances"
        type: string
        direction: input
        required: false
        example: bbknn_distances
        description: |
          The `.obsm` key of the input (query) dataset containing pre-calculated distances. 
          If not provided, the distances will be calculated using PyNNDescent.
          Make sure the distance matrix contains distances relative to the reference dataset and that these distances were obtained in the same way as the reference embedding.
  - name: Reference dataset arguments
    arguments:
      - name: "--reference_obsm_distances"
        type: string
        required: false
        description: |
          The `.obsm` key of the reference dataset containing pre-calculated distances. 
          If not provided, the distances will be calculated using PyNNDescent.
        example: bbknn_distances
  - name: KNN label transfer arguments
    arguments:
@@ -30,6 +51,7 @@ argument_groups:
        description: |
          The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. 
          Larger values will result in more accurate search results at the cost of computation time.
 resources:
  - type: python_script
    path: script.py
@@ -56,9 +78,7 @@ engines:
        packages:
          - pynndescent~=0.5.10
          - numpy<2
-    test_setup:
+    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
      - type: python
        __merge__: [ /src/base/requirements/viashpy.yaml ]
 runners:
  - type: executable
--- a/src/labels_transfer/pynndescent_knn/script.py
+++ b/src/labels_transfer/pynndescent_knn/script.py
@@ -77,20 +77,46 @@ r_adata = r_mdata.mod[par["modality"]]
 logger.info("Checking arguments")
 par = check_arguments(par)
-# Generating training and inference data
+if par["input_obsm_distances"] and par["reference_obsm_distances"]:
-logger.info("Generating training and inference data")
+    logger.info("Using pre-calculated distances for KNN classification as provided in `--input_obsm_distances` and `--reference_obsm_distances`.")
 train_X = get_reference_features(r_adata, par, logger)
 inference_X = get_query_features(q_adata, par, logger)
-neighbors_transformer = PyNNDescentTransformer(
+    assert par["input_obsm_distances"] in q_adata.obsm, f"Make sure --input_obsm_distances {par['input_obsm_distances']} is a valid .obsm key. Found: {q_adata.obsm.keys()}."
-    n_neighbors=par["n_neighbors"],
+    assert par["reference_obsm_distances"] in r_adata.obsm, f"Make sure --reference_obsm_distances {par['reference_obsm_distances']} is a valid .obsm key. Found: {r_adata.obsm.keys()}."
    parallel_batch_queries=True,
 )
 neighbors_transformer.fit(train_X)
-# Square sparse matrix with distances to n neighbors in reference data
+    query_neighbors = q_adata.obsm[par["input_obsm_distances"]]
-reference_neighbors = neighbors_transformer.transform(inference_X)
+    reference_neighbors = r_adata.obsm[par["reference_obsm_distances"]]
-query_neighbors = neighbors_transformer.transform(train_X)
+
    if query_neighbors.shape[1] != reference_neighbors.shape[1]:
        raise ValueError("The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset.")
    # Make sure the number of neighbors present in the distance matrix matches the requested number of neighbors in --n_neighbors
    # Otherwise reduce n_neighbors for KNN
    smallest_neighbor_count = min(
        np.diff(query_neighbors.indptr).min(),
        np.diff(reference_neighbors.indptr).min()
    )
    if smallest_neighbor_count < par["n_neighbors"]:
        logger.warning(f"The number of neighbors in the distance matrices is smaller than the requested number of neighbors in --n_neighbors. Reducing n_neighbors to {smallest_neighbor_count} for KNN Classification")
        par["n_neighbors"] = smallest_neighbor_count
 elif par["input_obsm_distances"] or par["reference_obsm_distances"]:
    raise ValueError("Make sure to provide both --input_obsm_distances and --reference_obsm_distances if you want to use a pre-calculated distance matrix for KNN classification.")
 elif not par["input_obsm_distances"] and not par["reference_obsm_distances"]:
    logger.info("No pre-calculated distances were provided. Calculating distances using the PyNNDescent algorithm.")
    # Generating training and inference data
    train_X = get_reference_features(r_adata, par, logger)
    inference_X = get_query_features(q_adata, par, logger)
    neighbors_transformer = PyNNDescentTransformer(
        n_neighbors=par["n_neighbors"],
        parallel_batch_queries=True,
    )
    neighbors_transformer.fit(train_X)
    # Square sparse matrix with distances to n neighbors in reference data
    query_neighbors = neighbors_transformer.transform(inference_X)
    reference_neighbors = neighbors_transformer.transform(train_X)
 # For each target, train a classifier and predict labels
 for obs_tar, obs_pred, obs_proba in zip(par["reference_obs_targets"],  par["output_obs_predictions"], par["output_obs_probability"]):
@@ -104,10 +130,14 @@ for obs_tar, obs_pred, obs_proba in zip(par["reference_obs_targets"],  par["outp
    logger.info(f"Using KNN classifier with {par['weights']} weights")
    train_y = r_adata.obs[obs_tar].to_numpy()
-    classifier = KNeighborsClassifier(n_neighbors=par["n_neighbors"], metric="precomputed", weights=weights_dict[par["weights"]])
+    classifier = KNeighborsClassifier(
-    classifier.fit(X=query_neighbors, y=train_y)
+        n_neighbors=par["n_neighbors"],
-    predicted_labels = classifier.predict(reference_neighbors)
+        metric="precomputed",
-    probabilities = classifier.predict_proba(reference_neighbors).max(axis=1)
+        weights=weights_dict[par["weights"]]
        )
    classifier.fit(X=reference_neighbors, y=train_y)
    predicted_labels = classifier.predict(query_neighbors)
    probabilities = classifier.predict_proba(query_neighbors).max(axis=1)
    # save_results
    logger.info(f"Saving predictions to {obs_pred} and probabilities to {obs_proba} in obs")
--- a/src/labels_transfer/knn/test.py
+++ b/src/labels_transfer/knn/test.py
@@ -0,0 +1,155 @@
 import re
 import subprocess
 import pytest
 from pathlib import Path
 import anndata as ad
 import mudata as mu
 import numpy as np
 from scipy.sparse import csr_matrix
 ## VIASH START
 meta = {
    'resources_dir': './resources_test/'
 }
 ## VIASH END
 # Reference dataset in .h5ad format, located inside the test resources directory.
 reference_h5ad_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad"
 # convert reference to h5mu
 # NOTE: this conversion runs at import time and writes the .h5mu file next to
 # the source .h5ad inside the resources directory.
 reference_adata = ad.read_h5ad(reference_h5ad_file)
 # Wrap the AnnData object as the "rna" modality of a new MuData object.
 reference_mdata = mu.MuData({"rna": reference_adata})
 reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
 reference_mdata.write_h5mu(reference_file)
 # Query dataset that the tests pass to the component via --input.
 input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
 def test_label_transfer(run_component, random_h5mu_path):
    """Run the component with default output column names.

    Checks that the output file exists and that the rna ``.obs`` contains the
    ``cell_type_pred`` and ``cell_type_probability`` columns.
    """
    output = random_h5mu_path()
    run_component(
        [
            "--input", input_file,
            "--modality", "rna",
            "--reference", reference_file,
            "--reference_obs_targets", "cell_type",
            "--output", output,
            "--n_neighbors", "5",
        ]
    )

    written_file = Path(output)
    assert written_file.is_file()

    output_data = mu.read_h5mu(output)
    rna_columns = output_data.mod["rna"].obs
    assert "cell_type_pred" in rna_columns, f"Predictions cell_type_pred is missing from output\noutput: {output_data.mod['rna'].obs}"
    assert "cell_type_probability" in rna_columns, f"Uncertainties cell_type_probability is missing from output\noutput: {output_data.mod['rna'].obs}"
@pytest.mark.parametrize("weights", ["uniform", "distance", "gaussian"])
 def test_label_transfer_prediction_columns(run_component, weights, random_h5mu_path):
    output = random_h5mu_path()
    args = [
        "--input", input_file,
        "--modality", "rna",
        "--reference", reference_file,
        "--reference_obs_targets", "cell_type",
        "--weights", weights,
        "--output", output,
        "--output_obs_probability", "test_probability",
        "--output_obs_predictions", "test_prediction",
        "--n_neighbors", "5"
    ]
    run_component(args)
    assert Path(output).is_file()
    output_data = mu.read_h5mu(output)
    assert "test_prediction" in output_data.mod["rna"].obs, f"Predictions test_prediction is missing from output\noutput: {output_data.mod['rna'].obs}"
    assert "test_probability" in output_data.mod["rna"].obs, f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}"
 def test_label_transfer_prediction_precomputed_neighbor_graph(run_component, random_h5mu_path):
    """Run the component with pre-calculated distance matrices stored in ``.obsm``.

    Mock sparse distance matrices are added to copies of the reference and
    query datasets; the copies are written to temporary paths so the shared
    test resources are not modified. The matrix sizes are derived from the
    number of observations instead of being hard-coded.
    """
    output = random_h5mu_path()
    # Temporary copies: never overwrite the shared resource files in place.
    reference_with_distances = str(random_h5mu_path())
    query_with_distances = str(random_h5mu_path())

    # Add mock distance matrix to obsm slot
    reference_mdata = mu.read_h5mu(reference_file)
    n_reference = reference_mdata.mod["rna"].n_obs
    # Reference-to-reference distances: square matrix, roughly half the entries zeroed.
    ref_distances = np.random.rand(n_reference, n_reference)
    ref_distances[ref_distances < 0.5] = 0
    reference_mdata.mod["rna"].obsm["distances"] = csr_matrix(ref_distances)
    reference_mdata.write_h5mu(reference_with_distances)

    query_mdata = mu.read_h5mu(input_file)
    n_query = query_mdata.mod["rna"].n_obs
    # Query-to-reference distances: one row per query cell, one column per reference cell.
    query_distances = np.random.rand(n_query, n_reference)
    query_distances[query_distances < 0.5] = 0
    query_mdata.mod["rna"].obsm["distances"] = csr_matrix(query_distances)
    query_mdata.write_h5mu(query_with_distances)

    args = [
        "--input", query_with_distances,
        "--modality", "rna",
        "--reference", reference_with_distances,
        "--reference_obs_targets", "cell_type",
        "--output", output,
        "--input_obsm_distances", "distances",
        "--reference_obsm_distances", "distances",
        "--output_obs_probability", "test_probability",
        "--output_obs_predictions", "test_prediction",
        "--n_neighbors", "5"
    ]
    run_component(args)
    assert Path(output).is_file()
    output_data = mu.read_h5mu(output)
    assert "test_prediction" in output_data.mod["rna"].obs, f"Predictions test_prediction is missing from output\noutput: {output_data.mod['rna'].obs}"
    assert "test_probability" in output_data.mod["rna"].obs, f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}"
 def test_raises_distance_matrix_dimensions(run_component, random_h5mu_path):
    """Check that mismatching distance matrix widths make the component fail.

    The reference matrix is deliberately given fewer columns than the query
    matrix. Both modified datasets are written to temporary paths so the
    invalid matrix never ends up in the shared test resources, and the matrix
    sizes are derived from the number of observations.
    """
    output = random_h5mu_path()
    # Temporary copies: never overwrite the shared resource files in place.
    reference_with_distances = str(random_h5mu_path())
    query_with_distances = str(random_h5mu_path())

    reference_mdata = mu.read_h5mu(reference_file)
    n_reference = reference_mdata.mod["rna"].n_obs
    # Deliberately wrong width (a quarter of the reference size) to trigger the error.
    ref_distances = np.random.rand(n_reference, n_reference // 4)
    ref_distances[ref_distances < 0.5] = 0
    reference_mdata.mod["rna"].obsm["distances"] = csr_matrix(ref_distances)
    reference_mdata.write_h5mu(reference_with_distances)

    query_mdata = mu.read_h5mu(input_file)
    n_query = query_mdata.mod["rna"].n_obs
    query_distances = np.random.rand(n_query, n_reference)
    query_distances[query_distances < 0.5] = 0
    query_mdata.mod["rna"].obsm["distances"] = csr_matrix(query_distances)
    query_mdata.write_h5mu(query_with_distances)

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component([
            "--input", query_with_distances,
            "--modality", "rna",
            "--reference", reference_with_distances,
            "--reference_obs_targets", "cell_type",
            "--output", output,
            "--input_obsm_distances", "distances",
            "--reference_obsm_distances", "distances",
            "--output_obs_probability", "test_probability",
            "--output_obs_predictions", "test_prediction",
            "--n_neighbors", "5"
        ])
    # The component's error message is searched for in the captured stdout.
    assert re.search(
        r"ValueError: The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset.",
        err.value.stdout.decode('utf-8')
        )
 if __name__ == '__main__':
    exit(pytest.main([__file__]))
--- a/src/labels_transfer/pynndescent_knn/test.py
+++ b/src/labels_transfer/pynndescent_knn/test.py
@@ -1,70 +0,0 @@
 import pytest
 from pathlib import Path
 import anndata as ad
 import mudata as mu
 ## VIASH START
 meta = {
    'resources_dir': './resources_test/'
 }
 ## VIASH END
 # Reference dataset in .h5ad format, located inside the test resources directory.
 reference_h5ad_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad"
 # convert reference to h5mu
 # NOTE: this conversion runs at import time and writes the .h5mu file next to
 # the source .h5ad inside the resources directory.
 reference_adata = ad.read_h5ad(reference_h5ad_file)
 reference_mdata = mu.MuData({"rna": reference_adata})
 reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
 reference_mdata.write_h5mu(reference_file)
 # Query dataset that the tests pass to the component via --input.
 input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
 def test_label_transfer(run_component):
    """Run the component with default output column names.

    The output is written to ``output.h5mu`` relative to the current working
    directory; the test then checks that the rna ``.obs`` contains the
    ``cell_type_pred`` and ``cell_type_probability`` columns.
    """
    args = [
        "--input", input_file,
        "--modality", "rna",
        "--reference", reference_file,
        "--reference_obs_targets", "cell_type",
        "--output", "output.h5mu",
        "--n_neighbors", "5"
    ]
    run_component(args)
    assert Path("output.h5mu").is_file()
    output_data = mu.read_h5mu("output.h5mu")
    assert "cell_type_pred" in output_data.mod["rna"].obs, f"Predictions cell_type_pred is missing from output\noutput: {output_data.mod['rna'].obs}"
    assert "cell_type_probability" in output_data.mod["rna"].obs, f"Uncertainties cell_type_probability is missing from output\noutput: {output_data.mod['rna'].obs}"
@pytest.mark.parametrize("weights", ["uniform", "distance", "gaussian"])
 def test_label_transfer_prediction_columns(run_component, weights):
    """Run the component once per weighting scheme with custom output column names.

    Each parametrization writes to its own ``output_<weights>.h5mu`` file
    relative to the current working directory, then checks that the rna
    ``.obs`` contains ``test_prediction`` and ``test_probability``.
    """
    output = f"output_{weights}.h5mu"
    args = [
        "--input", input_file,
        "--modality", "rna",
        "--reference", reference_file,
        "--reference_obs_targets", "cell_type",
        "--weights", weights,
        "--output", output,
        "--output_obs_probability", "test_probability",
        "--output_obs_predictions", "test_prediction",
        "--n_neighbors", "5"
    ]
    run_component(args)
    assert Path(output).is_file()
    output_data = mu.read_h5mu(output)
    assert "test_prediction" in output_data.mod["rna"].obs, f"Predictions test_prediction is missing from output\noutput: {output_data.mod['rna'].obs}"
    assert "test_probability" in output_data.mod["rna"].obs, f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}"
 if __name__ == '__main__':
    exit(pytest.main([__file__]))
--- a/src/reference/build_bdrhap_reference/config.vsh.yaml
+++ b/src/reference/build_bdrhap_reference/config.vsh.yaml
@@ -114,7 +114,7 @@ resources:
  - path: make_rhap_reference_2.2.1_nodocker.cwl
 test_resources:
  - type: bash_script
-    path: run_test.sh
+    path: test.sh
  - path: /resources_test/reference_gencodev41_chr1/reference.fa.gz
  - path: /resources_test/reference_gencodev41_chr1/reference.gtf.gz
--- a/src/reference/build_bdrhap_reference/run_test.sh
+++ b/src/reference/build_bdrhap_reference/run_test.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-
+set -eou pipefail
 ## VIASH START
 meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --"
--- a/src/reference/build_cellranger_arc_reference/config.vsh.yaml
+++ b/src/reference/build_cellranger_arc_reference/config.vsh.yaml
@@ -53,7 +53,7 @@ resources:
    path: script.sh
 test_resources:
  - type: bash_script
-    path: run_test.sh
+    path: test.sh
  - path: /resources_test/reference_gencodev41_chr1
 engines:
  - type: docker
--- a/src/reference/build_cellranger_arc_reference/script.sh
+++ b/src/reference/build_cellranger_arc_reference/script.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-set -eou pipefail
+set -eo pipefail
 ## VIASH START
 par_genome_fasta="resources_test/reference_gencodev41_chr1/reference.fa.gz"
--- a/src/reference/build_cellranger_arc_reference/run_test.sh
+++ b/src/reference/build_cellranger_arc_reference/run_test.sh
--- a/src/reference/build_cellranger_reference/config.vsh.yaml
+++ b/src/reference/build_cellranger_reference/config.vsh.yaml
@@ -33,7 +33,7 @@ resources:
    path: script.sh
 test_resources:
  - type: bash_script
-    path: run_test.sh
+    path: test.sh
  - path: /resources_test/reference_gencodev41_chr1
 engines:
--- a/src/reference/build_cellranger_reference/script.sh
+++ b/src/reference/build_cellranger_reference/script.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-set -eou pipefail
+set -eo pipefail
 ## VIASH START
 par_genome_fasta="resources_test/reference_gencodev41_chr1/reference.fa.gz"
--- a/src/reference/build_cellranger_reference/run_test.sh
+++ b/src/reference/build_cellranger_reference/run_test.sh
--- a/src/reference/build_star_reference/test.sh
+++ b/src/reference/build_star_reference/test.sh
@@ -1,4 +1,6 @@
-# set -eo pipefail
+#!/bin/bash
 set -eou pipefail
 ## VIASH START
 meta_resources_dir="./resources_test"
--- a/src/reference/cellranger_mkgtf/config.vsh.yaml
+++ b/src/reference/cellranger_mkgtf/config.vsh.yaml
@@ -27,8 +27,6 @@ resources:
  - type: bash_script
    path: script.sh
 test_resources:
  # - type: bash_script
  #   path: run_test.sh
  - type: python_script
    path: test.py
  - path: /resources_test/reference_gencodev41_chr1
--- a/src/reference/cellranger_mkgtf/run_test.sh
+++ b/src/reference/cellranger_mkgtf/run_test.sh
@@ -1,46 +0,0 @@
 #!/bin/bash
 set -eou pipefail
 ## VIASH START
 meta_executable="bin/viash run src/reference/cellranger_mkgtf/config.vsh.yaml --"
 ## VIASH END
 # Test for cellranger_mkgtf: filter a small GTF on two gene types and check
 # that the output only contains the expected number of distinct gene types.
 # create temporary directory
 tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX")
 function clean_up {
    rm -rf "$tmpdir"
 }
 # Remove the temporary directory whenever the script exits.
 trap clean_up EXIT
 # Keep only GTF records whose 4th column (start position) is below 50001.
 zcat "$meta_resources_dir/reference_gencodev41_chr1/reference.gtf.gz" | awk '$4 < 50001 {print ;}' | gzip > "$tmpdir/reference_small.gtf.gz"
 expected_gene_types=("transcribed_unprocessed_pseudogene" "miRNA")
 # Build a comma-separated "gene_type:<value>" list for --attribute.
 attribute_values=$(printf 'gene_type:%s,' "${expected_gene_types[@]}")
 attribute_values=${attribute_values%,}  # remove trailing comma
 echo "$attribute_values"
 echo "> Running $meta_name, writing to $tmpdir."
 # The CPU count comes from meta_cpus (previously the memory variable was
 # passed here by mistake); both settings fall back to a default when unset.
 $meta_executable \
  --input_gtf "$tmpdir/reference_small.gtf.gz" \
  --output_gtf "$tmpdir/myreference_filtered.gtf.gz" \
  --attribute "$attribute_values" \
  ---cpus ${meta_cpus:-1} \
  ---memory ${meta_memory_gb:-2}GB
 exit_code=$?
 [[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1
 echo ">> Checking whether output can be found"
 [[ ! -f "$tmpdir/myreference_filtered.gtf.gz" ]] && echo "Output gtf file could not be found!" && exit 1
 echo ">> Checking attribute 'gene_type' in output gtf file"
 # Collect the distinct gene_type values found in the attribute column (9th field).
 unique_gene_types=$(zcat "$tmpdir/myreference_filtered.gtf.gz" | awk -F'\t' '$9 ~ /gene_type/ { split($9, a, ";"); for(i in a) if(a[i] ~ /gene_type/) print a[i] }' | sed 's/.*gene_type "\(.*\)".*/\1/' | sort -u)
 echo "Expected gene types: ${expected_gene_types[@]}"
 echo "Unique gene types: $unique_gene_types"
 # NOTE: only the number of distinct gene types is compared, not their names.
 if [[ "${#expected_gene_types[@]}" != "$(echo "$unique_gene_types" | wc -w)" ]]; then
  echo "Error: Not all expected gene types were found in the output gtf file"
  exit 1
 fi
 echo "> Test succeeded!"
--- a/src/reference/cellranger_mkgtf/script.sh
+++ b/src/reference/cellranger_mkgtf/script.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-set -eou pipefail
+set -eo pipefail
 ## VIASH START
 par_input_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
--- a/src/reference/make_reference/config.vsh.yaml
+++ b/src/reference/make_reference/config.vsh.yaml
@@ -49,7 +49,7 @@ resources:
    path: script.sh
 test_resources:
  - type: bash_script
-    path: run_test.sh
+    path: test.sh
 engines:
  - type: docker
    image: ubuntu:22.04
--- a/src/reference/make_reference/run_test.sh
+++ b/src/reference/make_reference/run_test.sh
@@ -1,42 +0,0 @@
 #!/bin/bash
 set -eou pipefail
 ## VIASH START
 meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --"
 ## VIASH END
 # Test for make_reference: build a reference from chromosome 1 plus ERCC
 # spike-ins and check that both appear in the output fasta.
 echo "> Running $meta_name."
 # Output file names, relative to the current working directory.
 fasta="myreference.fa.gz"
 gtf="myreference.gtf.gz"
 # Download the inputs into the current working directory.
 wget https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
 wget https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.chr.gtf.gz
 wget https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip
 $meta_executable \
  --genome_fasta "Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" \
  --transcriptome_gtf "Homo_sapiens.GRCh38.109.chr.gtf.gz" \
  --ercc "ERCC92.zip" \
  --subset_regex "(ERCC-00002|1)" \
  --output_fasta $fasta \
  --output_gtf $gtf
 # NOTE: with `set -e` active a failing command above already aborts the
 # script, so this check is only reached with exit code 0.
 exit_code=$?
 [[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1
 echo ">> Checking whether output can be found"
 [[ ! -f $fasta ]] && echo "Output fasta file could not be found!" && exit 1
 [[ ! -f $gtf ]] && echo "Output gtf file could not be found!" && exit 1
 echo ">> Checking contents of fasta"
 # The fasta must contain a header starting with '>1' (chromosome 1) ...
 if ! zgrep -q '>1' $fasta; then
  echo "Could not find chromosome '1' in output reference!"
  exit 1
 fi
 # ... and the ERCC-00002 spike-in sequence selected by --subset_regex.
 if ! zgrep -q '>ERCC-00002' $fasta; then
  echo "Could not find ERCC-00002 in output reference!"
  exit 1
 fi
 echo "> Test succeeded!"
--- a/src/reference/make_reference/script.sh
+++ b/src/reference/make_reference/script.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-set -eou pipefail
+set -eo pipefail
 ## VIASH START
 par_genome_fasta="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz"
--- a/src/reference/make_reference/test.sh
+++ b/src/reference/make_reference/test.sh
@@ -0,0 +1,80 @@
 #!/bin/bash
 set -eo pipefail
 ## VIASH START
 meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --"
 ## VIASH END
 # Test for make_reference:
 #   Test 1 builds a reference with ERCC spike-ins and expects ERCC-00002 in the fasta.
 #   Test 2 builds a reference without ERCC input and expects no ERCC-00002.

 # Print the given message and abort the test run with exit code 1.
 function fail {
  echo "$1"
  exit 1
 }

 # Check that the fasta ($1) and gtf ($2) exist and that the fasta contains chromosome 1.
 function check_outputs {
  local fasta_file="$1"
  local gtf_file="$2"
  echo ">> Checking whether output can be found"
  [[ -f "$fasta_file" ]] || fail "Output fasta file could not be found!"
  [[ -f "$gtf_file" ]] || fail "Output gtf file could not be found!"
  echo ">> Checking contents of fasta"
  zgrep -q '>1' "$fasta_file" || fail "Could not find chromosome '1' in output reference!"
 }

 # Fetch test data
 echo ">> Fetching test data"
 wget https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
 wget https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.chr.gtf.gz
 wget https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip

 # Test 1
 echo ">> Test1"
 mkdir test1
 pushd test1
 fasta="myreference.fa.gz"
 gtf="myreference.gtf.gz"
 # The failure is handled in the same command list: with `set -e` a separate
 # `exit_code=$?` check on the next line would never be reached on failure.
 "$meta_executable" \
  --genome_fasta "../Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" \
  --transcriptome_gtf "../Homo_sapiens.GRCh38.109.chr.gtf.gz" \
  --ercc "../ERCC92.zip" \
  --subset_regex "(ERCC-00002|1)" \
  --output_fasta "$fasta" \
  --output_gtf "$gtf" \
  || fail "Non zero exit code: $?"
 check_outputs "$fasta" "$gtf"
 zgrep -q '>ERCC-00002' "$fasta" || fail "Could not find ERCC-00002 in output reference!"
 popd

 # Test 2
 echo ">> Test 2"
 mkdir test2
 pushd test2
 fasta="myreference.fa.gz"
 gtf="myreference.gtf.gz"
 "$meta_executable" \
  --genome_fasta "../Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" \
  --transcriptome_gtf "../Homo_sapiens.GRCh38.109.chr.gtf.gz" \
  --output_fasta "$fasta" \
  --output_gtf "$gtf" \
  || fail "Non zero exit code: $?"
 check_outputs "$fasta" "$gtf"
 # No ERCC input was given, so the spike-in sequence must be absent.
 if zgrep -q '>ERCC-00002' "$fasta"; then
  fail "Should not find ERCC-00002 in output reference!"
 fi
 popd
 echo "> Test succeeded!"
--- a/src/workflows/gdo/gdo_singlesample/integration_test.sh
+++ b/src/workflows/gdo/gdo_singlesample/integration_test.sh
@@ -1,22 +1,16 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=21.10.6
+nextflow \
-
+  run . \
 viash ns build -q gdo_singlesample
 nextflow run . \
  -main-script src/workflows/gdo/gdo_singlesample/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
-  -with-trace work/trace.txt \
+  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/gdo/gdo_singlesample/main.nf
+++ b/src/workflows/gdo/gdo_singlesample/main.nf
@@ -43,8 +43,7 @@ workflow run_wf {
        ]
        return newState
      },
-      toState: ["output": "output"],
+      toState: ["output": "output"]
      auto: [ publish: true ]
    )
    | setState(["output"])
  emit:
--- a/src/workflows/gdo/gdo_singlesample/test.nf
+++ b/src/workflows/gdo/gdo_singlesample/test.nf
@@ -5,13 +5,13 @@ include { gdo_singlesample } from params.rootDir + "/target/nextflow/workflows/g
 params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "simple_execution_test",
-        input: file(params.resources_test).resolve("10x_5k_lung_crispr/SC3_v3_NextGem_DI_CRISPR_A549_5K.h5mu"),
+        input: resources_test.resolve("10x_5k_lung_crispr/SC3_v3_NextGem_DI_CRISPR_A549_5K.h5mu"),
        min_counts: 3,
        max_counts: 10000000,
        min_guides_per_cell: 2,
--- a/src/workflows/ingestion/bd_rhapsody/integration_test.sh
+++ b/src/workflows/ingestion/bd_rhapsody/integration_test.sh
@@ -1,22 +1,16 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 export NXF_VER=21.10.6
 nextflow \
  run . \
  -main-script src/workflows/ingestion/bd_rhapsody/test.nf \
  -entry test_wf \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config 
  -with-trace work/trace.txt
--- a/src/workflows/ingestion/bd_rhapsody/test.nf
+++ b/src/workflows/ingestion/bd_rhapsody/test.nf
@@ -6,16 +6,16 @@ include { bd_rhapsody_test } from params.rootDir + "/target/nextflow/test_workfl
 params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList(
    [
      [
        id: "foo",
        reads: file("${params.resources_test}/bdrhap_5kjrt/raw/12*.fastq.gz"),
-        reference_archive: file(params.resources_test).resolve("reference_gencodev41_chr1/reference_bd_rhapsody.tar.gz"),
+        reference_archive: resources_test.resolve("reference_gencodev41_chr1/reference_bd_rhapsody.tar.gz"),
-        abseq_reference: file(params.resources_test).resolve("bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta"),
+        abseq_reference: resources_test.resolve("bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta"),
        cell_calling_data: "mRNA",
        exact_cell_count: 4900
      ] 
--- a/src/workflows/ingestion/cellranger_mapping/integration_test.sh
+++ b/src/workflows/ingestion/cellranger_mapping/integration_test.sh
@@ -1,23 +1,15 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 viash ns build -q ingestion/cellranger_mapping --setup cb --platform nextflow
 export NXF_VER=21.10.6
 nextflow \
  run . \
  -main-script src/workflows/ingestion/cellranger_mapping/test.nf \
  -entry test_wf \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -with-trace work/trace.txt
--- a/src/workflows/ingestion/cellranger_mapping/main.nf
+++ b/src/workflows/ingestion/cellranger_mapping/main.nf
@@ -18,8 +18,7 @@ workflow run_wf {
      toState: [
        "input": "output",
        "output_raw": "output"
-      ],
+      ]
      auto: [ publish: true ]
    )
    // split output dir into map
    | cellranger_count_split.run(
@@ -49,14 +48,9 @@ workflow run_wf {
          "input_metrics_summary": state.metrics_summary
        ]
      },
-      toState: { id, output, state ->
+      toState: ["output_h5mu": "output"]
        [
          "output_raw": state.output_raw,
          "output_h5mu": output.output
        ]
      },
      auto: [ publish: true ],
    )
    | setState(["output_raw", "output_h5mu"])
  emit:
  output_ch
--- a/src/workflows/ingestion/cellranger_mapping/test.nf
+++ b/src/workflows/ingestion/cellranger_mapping/test.nf
@@ -7,11 +7,13 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [  
        id: "foo",
-        input: file(params.resources_test).resolve("cellranger_tiny_fastq/cellranger_tiny_fastq"),
+        input: resources_test.resolve("cellranger_tiny_fastq/cellranger_tiny_fastq"),
-        reference: file(params.resources_test).resolve("cellranger_tiny_fastq/cellranger_tiny_ref"),
+        reference: resources_test.resolve("cellranger_tiny_fastq/cellranger_tiny_ref"),
        output_type: "filtered",
      ]
    ])
--- a/src/workflows/ingestion/cellranger_multi/integration_test.sh
+++ b/src/workflows/ingestion/cellranger_multi/integration_test.sh
@@ -1,32 +1,23 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 export NXF_VER=22.10.3
 nextflow \
  run . \
  -main-script src/workflows/ingestion/cellranger_multi/test.nf \
  -entry test_wf \
-  -resume \
+  -profile docker,no_publish \
  -profile no_publish,docker \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -with-trace work/trace.txt
 nextflow \
  run . \
  -main-script src/workflows/ingestion/cellranger_multi/test.nf \
  -entry test_wf2 \
-  -resume \
+  -profile docker,no_publish \
  -profile no_publish,docker \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -with-trace work/trace.txt
--- a/src/workflows/ingestion/cellranger_multi/test.nf
+++ b/src/workflows/ingestion/cellranger_multi/test.nf
@@ -7,20 +7,32 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "foo",
-        input:[file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz"),
+        input:[
-               file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz"),
+          resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz"),
-               file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz"),
+          resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz"),
-               file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz"),
+          resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz"),
-               file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz"),
+          resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz"),
-               file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz")],
+          resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz"),
-        gex_reference: file(params.resources_test).resolve("reference_gencodev41_chr1/reference_cellranger.tar.gz"),
+          resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz")
-        vdj_reference: file(params.resources_test).resolve("10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"),
+        ],
-        feature_reference: file(params.resources_test).resolve("10x_5k_anticmv/raw/feature_reference.csv"),
+        gex_reference: resources_test.resolve("reference_gencodev41_chr1/reference_cellranger.tar.gz"),
-        library_id: ["5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", "5k_human_antiCMV_T_TBNK_connect_AB_subset", "5k_human_antiCMV_T_TBNK_connect_VDJ_subset"],
+        vdj_reference: resources_test.resolve("10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"),
-        library_type: ["Gene Expression", "Antibody Capture", "VDJ"]
+        feature_reference: resources_test.resolve("10x_5k_anticmv/raw/feature_reference.csv"),
        library_id: [
          "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset",
          "5k_human_antiCMV_T_TBNK_connect_AB_subset",
          "5k_human_antiCMV_T_TBNK_connect_VDJ_subset"
        ],
        library_type: [
          "Gene Expression",
          "Antibody Capture",
          "VDJ"
        ]
      ]
    ])
    | map{ state -> [state.id, state] }
@@ -44,8 +56,9 @@ workflow test_wf {
 }
 workflow test_wf2 {
-  // Test cell multiplexing
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "foo",
--- a/src/workflows/ingestion/cellranger_postprocessing/integration_test.sh
+++ b/src/workflows/ingestion/cellranger_postprocessing/integration_test.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
@@ -14,9 +12,7 @@ nextflow \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -with-trace work/trace.txt \
  -resume
 nextflow \
  run . \
@@ -24,6 +20,4 @@ nextflow \
  -entry test_wf2 \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -with-trace work/trace.txt \
  -resume
--- a/src/workflows/ingestion/cellranger_postprocessing/main.nf
+++ b/src/workflows/ingestion/cellranger_postprocessing/main.nf
@@ -4,9 +4,10 @@ workflow run_wf {
  main:
  // perform correction if so desired
-  mid1_corrected = input_ch
+
-    | filter{ it[1].perform_correction }
+  output_ch = input_ch
    | cellbender_remove_background.run(
      runIf: {id, state -> state.perform_correction},
      fromState: { id, state ->
        [
          input: state.input,
@@ -16,17 +17,13 @@ workflow run_wf {
        ]
      },
      toState: { id, output, state -> 
-        state + [input: output.output, layer: "cellbender_corrected"]
+        state + ["input": output.output, "layer": "cellbender_corrected"]
      }
    )
  mid1_uncorrected = input_ch
    | filter{ ! it[1].perform_correction }
  mid1 = mid1_corrected.mix(mid1_uncorrected)
  // perform filtering if so desired
  mid2_filtered = mid1
    | filter{ it[1].min_genes != null || it[1].min_counts != null }
    | filter_with_counts.run(
      runIf: {id, state -> 
        state.min_genes != null || state.min_counts != null
      },
      fromState: { id, state ->
        [
          input: state.input,
@@ -39,16 +36,14 @@ workflow run_wf {
      },
      toState: [input: "output"]
    )
-  mid2_unfiltered = mid1
+    // Make sure to use the correct output file names, 
-    | filter{ it[1].min_genes == null && it[1].min_counts == null }
+    // irrespective of whether or not any of the above 
-  mid2 = mid2_filtered.mix(mid2_unfiltered)
+    // components were run
  // return output map
  output_ch = mid2
    | publish.run(
      fromState: [ input: "input", output: "output" ],
-      auto: [ publish: true ]
+      toState: ["output": "output"]
    )
    | setState(["output"])
  emit:
  output_ch
--- a/src/workflows/ingestion/cellranger_postprocessing/test.nf
+++ b/src/workflows/ingestion/cellranger_postprocessing/test.nf
@@ -8,11 +8,13 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "foo",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
-        input_og: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
+        input_og: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
        perform_correction: true,
        min_genes: 100,
        min_counts: 1000,
@@ -55,9 +57,8 @@ workflow test_wf {
 }
 workflow test_wf2 {
  // allow changing the resources_test dir
  resources_test = file("${params.rootDir}/resources_test")
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
--- a/src/workflows/ingestion/conversion/integration_test.sh
+++ b/src/workflows/ingestion/conversion/integration_test.sh
@@ -5,13 +5,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 export NXF_VER=24.04.4
 nextflow \
  run . \
  -main-script src/workflows/ingestion/conversion/test.nf \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -with-trace work/trace.txt
--- a/src/workflows/ingestion/conversion/main.nf
+++ b/src/workflows/ingestion/conversion/main.nf
@@ -25,11 +25,9 @@ workflow run_wf {
        }
        passed_state
      },
-      toState: {id, output, state, comp ->
+      toState: ["output": "output"]
        ["output": output.output]
      },
      auto: [publish: true],
    )
    | setState(["output": "output"])
  emit:
  output_ch
--- a/src/workflows/ingestion/conversion/test.nf
+++ b/src/workflows/ingestion/conversion/test.nf
@@ -7,30 +7,32 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "10xh5_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5"),
        input_type: "10xh5",
        modality: null
      ],
      [
        id: "10xmtx_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"),
        input_type: "10xmtx",
        modality: null,
        output: "\$id.h5mu"
      ],
      [
        id: "10xmtx",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"),
        input_type: "10xmtx",
        modality: "rna",
        output: "\$key.h5mu"
      ],
      [
        id: "h5ad",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad"),
        input_type: "h5ad",
        modality: "rna",
        output: "\$key.h5mu"
--- a/src/workflows/ingestion/demux/integration_test.sh
+++ b/src/workflows/ingestion/demux/integration_test.sh
@@ -1,23 +1,15 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 export NXF_VER=21.10.6
 viash ns build -q 'workflows/ingestion/demux'
 nextflow \
  run . \
  -main-script src/workflows/ingestion/demux/test.nf \
  -entry test_wf \
  -resume \
  -profile docker,no_publish \
  -with-trace work/trace.txt \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config
+  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/ingestion/demux/test.nf
+++ b/src/workflows/ingestion/demux/test.nf
@@ -7,24 +7,26 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  // or when running from s3:
  Channel.fromList([
    [
      id: "mkfastq_test",
-      input: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl"),
+      input: resources_test.resolve("cellranger_tiny_bcl/bcl"),
-      sample_sheet: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"),
+      sample_sheet: resources_test.resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"),
      demultiplexer: "mkfastq"
    ],
    [
      id: "bclconvert_test",
-      input: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl2/"),
+      input: resources_test.resolve("cellranger_tiny_bcl/bcl2/"),
-      sample_sheet: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl2/sample_sheet.csv"),
+      sample_sheet: resources_test.resolve("cellranger_tiny_bcl/bcl2/sample_sheet.csv"),
      demultiplexer: "bclconvert"
    ],
    [
      id: "bcl2fastq_test",
-      input: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl"),
+      input: resources_test.resolve("cellranger_tiny_bcl/bcl"),
-      sample_sheet: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"),
+      sample_sheet: resources_test.resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"),
      demultiplexer: "bcl2fastq",
      ignore_missing: true
    ]
--- a/src/workflows/ingestion/make_reference/integration_test.sh
+++ b/src/workflows/ingestion/make_reference/integration_test.sh
@@ -1,20 +1,15 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 export NXF_VER=23.04.2
 nextflow \
  run . \
  -main-script src/workflows/ingestion/make_reference/test.nf \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -resume
--- a/src/workflows/ingestion/make_reference/test.nf
+++ b/src/workflows/ingestion/make_reference/test.nf
@@ -6,12 +6,14 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "gencode_v41_ercc",
-        genome_fasta: file(params.resources_test).resolve("reference_gencodev41_chr1/reference.fa.gz"),
+        genome_fasta: resources_test.resolve("reference_gencodev41_chr1/reference.fa.gz"),
-        transcriptome_gtf: file(params.resources_test).resolve("reference_gencodev41_chr1/reference.gtf.gz"),
+        transcriptome_gtf: resources_test.resolve("reference_gencodev41_chr1/reference.gtf.gz"),
-        ercc: file(params.resources_test).resolve("reference_gencodev41_chr1/ERCC92.zip"),
+        ercc: resources_test.resolve("reference_gencodev41_chr1/ERCC92.zip"),
        subset_regex: "(ERCC-00002|chr1)",
        target: ["cellranger", "bd_rhapsody", "star"]
      ]        
--- a/src/workflows/integration/bbknn_leiden/integration_test.sh
+++ b/src/workflows/integration/bbknn_leiden/integration_test.sh
@@ -1,25 +1,23 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=21.10.6
+nextflow \
-
+  run . \
 nextflow run . \
  -main-script src/workflows/integration/bbknn_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
-nextflow run . \
+nextflow \
  run . \
  -main-script src/workflows/integration/bbknn_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf2 \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config
+  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/integration/bbknn_leiden/main.nf
+++ b/src/workflows/integration/bbknn_leiden/main.nf
@@ -86,11 +86,9 @@ workflow run_wf {
          "output_compression": "gzip"
       ]
      },
-      toState: { id, output, state -> 
+      toState: ["output": "output"]
        [ output: output.output ]
      },
      auto: [publish: true]
    )
    | setState(["output"])
  emit:
  output_ch
--- a/src/workflows/integration/bbknn_leiden/test.nf
+++ b/src/workflows/integration/bbknn_leiden/test.nf
@@ -5,16 +5,19 @@ include { bbknn_leiden } from params.rootDir + "/target/nextflow/workflows/integ
 params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch =
    Channel.fromList([
      [
        id: "simple_execution_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        layer: "log_normalized"
      ],
      [
       id: "no_leiden_resolutions_test",
-       input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+       input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
       layer: "log_normalized",
       leiden_resolution: []
      ]
@@ -45,7 +48,9 @@ workflow test_wf {
 }
 workflow test_wf2 {
-  resources_test = file("${params.rootDir}/resources_test")
+
  resources_test = file(params.resources_test)
  output_ch =
    Channel.fromList([
      [
--- a/src/workflows/integration/harmony_leiden/integration_test.sh
+++ b/src/workflows/integration/harmony_leiden/integration_test.sh
@@ -1,27 +1,23 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=21.10.6
+nextflow \
-
+  run . \
 nextflow run . \
  -main-script src/workflows/integration/harmony_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
-  -resume \
+  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
-nextflow run . \
+nextflow \
  run . \
  -main-script src/workflows/integration/harmony_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf2 \
-  -resume \
+  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/integration/harmony_leiden/main.nf
+++ b/src/workflows/integration/harmony_leiden/main.nf
@@ -77,11 +77,9 @@ workflow run_wf {
          "output_compression": "gzip"
        ]
      },
-      toState: { id, output, state ->
+      toState: ["output": "output"]
        [ output: output.output ]
      },
      auto: [ publish: true ]
    )
    | setState(["output"])
  emit:
  output_ch
--- a/src/workflows/integration/harmony_leiden/test.nf
+++ b/src/workflows/integration/harmony_leiden/test.nf
@@ -6,11 +6,13 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = 
    Channel.fromList([
      [
        id: "simple_execution_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        layer: "log_normalized",
        obs_covariates: "sample_id",
        embedding: "X_pca",
@@ -19,7 +21,7 @@ workflow test_wf {
      ],
      [
        id: "no_leiden_resolutions_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        layer: "log_normalized",
        obs_covariates: "sample_id",
        embedding: "X_pca",
@@ -53,7 +55,8 @@ workflow test_wf {
 workflow test_wf2 {
-  resources_test = file("${params.rootDir}/resources_test")
+
  resources_test = file(params.resources_test)
  output_ch = 
    Channel.fromList([
--- a/src/workflows/integration/scanorama_leiden/integration_test.sh
+++ b/src/workflows/integration/scanorama_leiden/integration_test.sh
@@ -6,20 +6,18 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=21.10.6
+nextflow \
-
+  run . \
 nextflow run . \
  -main-script src/workflows/integration/scanorama_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
  -resume \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config \
 nextflow run . \
  -main-script src/workflows/integration/scanorama_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf2 \
  -resume \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
 nextflow \
  run . \
  -main-script src/workflows/integration/scanorama_leiden/test.nf \
  -entry test_wf2 \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/integration/scanorama_leiden/main.nf
+++ b/src/workflows/integration/scanorama_leiden/main.nf
@@ -74,11 +74,9 @@ workflow run_wf {
          "output_compression": "gzip"
        ]
      },
-      auto: [ publish: true ],
+      toState: ["output": "output"]
      toState: { id, output, state ->
        [ output: output.output ]
      }
    )
    | setState(["output"])
  emit:
  output_ch
--- a/src/workflows/integration/scanorama_leiden/test.nf
+++ b/src/workflows/integration/scanorama_leiden/test.nf
@@ -6,16 +6,18 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "simple_execution_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        layer: "log_normalized",
        leiden_resolution: [1.0, 0.25],
      ],
      [
        id: "no_leiden_resolutions_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        layer: "log_normalized",
        leiden_resolution: [],
      ]
@@ -46,8 +48,8 @@ workflow test_wf {
 }
 workflow test_wf2 {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
--- a/src/workflows/integration/scgpt_leiden/integration_test.sh
+++ b/src/workflows/integration/scgpt_leiden/integration_test.sh
@@ -1,29 +1,23 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=21.10.6
+nextflow \
-
+  run . \
 viash ns build -q scgpt_leiden
 nextflow run . \
  -main-script src/workflows/integration/scgpt_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
-  -resume \
+  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
-nextflow run . \
+nextflow \
  run . \
  -main-script src/workflows/integration/scgpt_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf2 \
-  -resume \
+  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/integration/scgpt_leiden/main.nf
+++ b/src/workflows/integration/scgpt_leiden/main.nf
@@ -152,11 +152,9 @@ workflow run_wf {
          "output": state.workflow_output
        ]
      },
-      toState: { id, output, state ->
+      toState: ["output": "output"]
        [ output: output.output ]
      },
      auto: [ publish: true ]
    )
    | setState(["output"])
  emit:
    output_ch
--- a/src/workflows/integration/scgpt_leiden/test.nf
+++ b/src/workflows/integration/scgpt_leiden/test.nf
@@ -6,30 +6,32 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
    output_ch = Channel.fromList([
        [
-            id: "simple_execution_test",
+          id: "simple_execution_test",
-            input: file(params.resources_test).resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"),
+          input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"),
-            model: file(params.resources_test).resolve("scgpt/source/best_model.pt"),
+          model: resources_test.resolve("scgpt/source/best_model.pt"),
-            model_config: file(params.resources_test).resolve("scgpt/source/args.json"),
+          model_config: resources_test.resolve("scgpt/source/args.json"),
-            model_vocab: file(params.resources_test).resolve("scgpt/source/vocab.json"),
+          model_vocab: resources_test.resolve("scgpt/source/vocab.json"),
-            input_layer: "log_normalized",
+          input_layer: "log_normalized",
-            obs_batch_label: "sample",
+          obs_batch_label: "sample",
-            n_hvg: 400,
+          n_hvg: 400,
-            seed: 1,
+          seed: 1,
-            leiden_resolution: [1.0, 0.25]
+          leiden_resolution: [1.0, 0.25]
        ],
        [
-            id: "no_leiden_resolutions_test",
+          id: "no_leiden_resolutions_test",
-            input: file(params.resources_test).resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"),
+          input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"),
-            model: file(params.resources_test).resolve("scgpt/source/best_model.pt"),
+          model: resources_test.resolve("scgpt/source/best_model.pt"),
-            model_config: file(params.resources_test).resolve("scgpt/source/args.json"),
+          model_config: resources_test.resolve("scgpt/source/args.json"),
-            model_vocab: file(params.resources_test).resolve("scgpt/source/vocab.json"),
+          model_vocab: resources_test.resolve("scgpt/source/vocab.json"),
-            obs_batch_label: "sample",
+          obs_batch_label: "sample",
-            n_hvg: 400,
+          n_hvg: 400,
-            seed: 1,
+          seed: 1,
-            input_layer: "log_normalized",
+          input_layer: "log_normalized",
-            leiden_resolution: []
+          leiden_resolution: []
        ]
    ])
    | map{ state -> [state.id, state] }
@@ -59,7 +61,8 @@ workflow test_wf {
 workflow test_wf2 {
-  resources_test = file("${params.rootDir}/resources_test/scgpt")
+
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
--- a/src/workflows/integration/scvi_leiden/integration_test.sh
+++ b/src/workflows/integration/scvi_leiden/integration_test.sh
@@ -1,19 +1,15 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=21.10.6
+nextflow \
-
+  run . \
 nextflow run . \
  -main-script src/workflows/integration/scvi_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -resume
--- a/src/workflows/integration/scvi_leiden/main.nf
+++ b/src/workflows/integration/scvi_leiden/main.nf
@@ -98,14 +98,9 @@ workflow run_wf {
        "output_compression": "gzip"
        ]
      },
-      auto: [ publish: true ],
+      toState: ["output": "output"] 
      toState: { id, output, state ->
        [ 
          output: output.output, 
          output_model: state.output_model
        ]
      }
    )
    | setState(["output", "output_model"])
  emit:
  output_ch
--- a/src/workflows/integration/scvi_leiden/test.nf
+++ b/src/workflows/integration/scvi_leiden/test.nf
@@ -6,10 +6,12 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "simple_execution_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        layer: "log_normalized",
        obs_batch: "sample_id",
        max_epochs: 1,
@@ -17,7 +19,7 @@ workflow test_wf {
      ],
      [
        id: "no_leiden_resolutions_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        layer: "log_normalized",
        obs_batch: "sample_id",
        output_model: "no_leiden_resolutions_test_model/",
--- a/src/workflows/integration/totalvi_leiden/integration_test.sh
+++ b/src/workflows/integration/totalvi_leiden/integration_test.sh
@@ -1,18 +1,15 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=23.04.2
+nextflow \
-
+  run . \
 nextflow run . \
  -main-script src/workflows/integration/totalvi_leiden/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/integration/totalvi_leiden/main.nf
+++ b/src/workflows/integration/totalvi_leiden/main.nf
@@ -139,15 +139,9 @@ workflow run_wf {
          "compression": "gzip"
        ]
      },
-      toState: { id, output, state ->
+      toState: ["output", "output"]
        [ 
          output: output.output, 
          reference_model_path: state.reference_model_path,
          query_model_path: state.query_model_path
        ]
      },
      auto: [ publish: true ]
    )
    | setState(["output", "reference_model_path", "query_model_path"])
  emit:
  output_ch
 }
--- a/src/workflows/integration/totalvi_leiden/test.nf
+++ b/src/workflows/integration/totalvi_leiden/test.nf
@@ -6,11 +6,13 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
    output_ch = Channel.fromList([
      [
        id: "simple_execution_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
-        reference: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        reference: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        prot_modality: "prot",
        prot_reference_modality: "prot",
        var_input: "filter_with_hvg",
@@ -21,8 +23,8 @@ workflow test_wf {
      ],
      [
        id: "no_prot_leiden_resolutions_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
-        reference: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        reference: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        prot_modality: "prot",
        prot_reference_modality: "prot",
        var_input: "filter_with_hvg",
@@ -34,8 +36,8 @@ workflow test_wf {
      ],
      [
        id: "no_rna_leiden_resolutions_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
-        reference: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        reference: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        prot_modality: "prot",
        prot_reference_modality: "prot",
        var_input: "filter_with_hvg",
--- a/src/workflows/multiomics/dimensionality_reduction/integration_test.sh
+++ b/src/workflows/multiomics/dimensionality_reduction/integration_test.sh
@@ -1,19 +1,15 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=21.10.6
+nextflow \
-
+  run . \
 nextflow run . \
  -main-script src/workflows/multiomics/dimensionality_reduction/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -resume
--- a/src/workflows/multiomics/dimensionality_reduction/main.nf
+++ b/src/workflows/multiomics/dimensionality_reduction/main.nf
@@ -44,11 +44,9 @@ workflow run_wf {
          "output_compression": "gzip"
        ]
      },
-      toState: { id, output, state ->
+      toState: ["output": "output"]
        [ output: output.output ]
      },
      auto: [ publish: true ]
    )
    | setState(["output"])
  emit:
  output_ch
--- a/src/workflows/multiomics/dimensionality_reduction/test.nf
+++ b/src/workflows/multiomics/dimensionality_reduction/test.nf
@@ -6,18 +6,19 @@ include { dimensionality_reduction_test } from params.rootDir + "/target/nextflo
 params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
-  // allow changing the resources_test dir
+
  resources_test = file(params.resources_test)
  input_ch = Channel.fromList([
      [
        id: "simple_execution_test",
-        input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
+        input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
        layer: "",
        output: "foo.final.h5mu"
      ],
      [
        id: "pca_obsm_output_test",
-        input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
+        input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
        layer: "",
        output: "foo.final.h5mu"
      ],
--- a/src/workflows/multiomics/process_batches/integration_test.sh
+++ b/src/workflows/multiomics/process_batches/integration_test.sh
@@ -1,26 +1,18 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 viash ns build -q process_batches
 export NXF_VER=24.04.4
 nextflow \
  run . \
  -main-script src/workflows/multiomics/process_batches/test.nf \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -resume
 nextflow \
  run . \
@@ -28,5 +20,4 @@ nextflow \
  -entry test_wf2 \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config \
+  -c src/workflows/utils/integration_tests.config
  -resume
--- a/src/workflows/multiomics/process_batches/main.nf
+++ b/src/workflows/multiomics/process_batches/main.nf
@@ -215,7 +215,7 @@ workflow run_wf {
            "output": state.workflow_output,
          ]
        },
-        auto: [publish: true]
+        toState: ["output": "output"]
      )
      | setState(["output"])
--- a/src/workflows/multiomics/process_batches/test.nf
+++ b/src/workflows/multiomics/process_batches/test.nf
@@ -9,16 +9,18 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  input_ch = Channel.fromList([
      [
          id: "test",
-          input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
+          input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
          publish_dir: "foo/",
          clr_axis: 0
      ],
      [
          id: "test2",
-          input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
+          input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
          publish_dir: "foo/",
          clr_axis: 1
      ]
@@ -52,8 +54,8 @@ workflow test_wf {
 }
 workflow test_wf2 {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  input_ch = Channel.fromList([
      [
--- a/src/workflows/multiomics/process_samples/integration_test.sh
+++ b/src/workflows/multiomics/process_samples/integration_test.sh
@@ -8,15 +8,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 export NXF_VER=23.10.3
 viash ns build -q '^workflows'
 nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
@@ -26,7 +21,6 @@ nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config \
@@ -37,7 +31,6 @@ nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config \
@@ -48,7 +41,6 @@ nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config \
@@ -57,9 +49,8 @@ nextflow \
 nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf2 \
  -resume \
  -profile docker,no_publish \
  -entry test_wf2 \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
@@ -67,7 +58,6 @@ nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf3 \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
@@ -76,7 +66,6 @@ nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf4 \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
@@ -85,7 +74,6 @@ nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf5 \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
@@ -94,7 +82,6 @@ nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf6 \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
@@ -103,7 +90,6 @@ nextflow \
  run . \
  -main-script src/workflows/multiomics/process_samples/test.nf \
  -entry test_wf7 \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config
+  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/multiomics/process_samples/test.nf
+++ b/src/workflows/multiomics/process_samples/test.nf
@@ -9,17 +9,19 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
    [
      id: "mouse",
-      input: file(params.resources_test).resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
+      input: resources_test.resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
      publish_dir: "foo/",
      rna_min_counts: 2,
      output: "test.h5mu",
    ],
    [
      id: "human",
-      input: file(params.resources_test).resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
+      input: resources_test.resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
      publish_dir: "foo/",
      rna_min_counts: 2,
      output: "test.h5mu",
@@ -42,8 +44,8 @@ workflow test_wf {
 }
 workflow test_wf2 {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
    [
@@ -94,8 +96,8 @@ workflow test_wf2 {
 }
 workflow test_wf3 {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  input_ch = Channel.fromList([
      [
@@ -156,8 +158,8 @@ workflow test_wf3 {
 }
 workflow test_wf4 {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
    [
@@ -187,8 +189,8 @@ workflow test_wf4 {
 }
 workflow test_wf5 {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
    [
@@ -235,8 +237,8 @@ workflow test_wf5 {
 }
 workflow test_wf6 {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
    [
@@ -298,8 +300,8 @@ workflow test_wf6 {
 // }
 workflow test_wf7 {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
    [
--- a/src/workflows/multiomics/split_modalities/integration_test.sh
+++ b/src/workflows/multiomics/split_modalities/integration_test.sh
@@ -4,12 +4,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-
+nextflow \
-
+  run . \
-nextflow run . \
+  -main-script src/workflows/multiomics/split_modalities/test.nf \
-    -main-script src/workflows/multiomics/split_modalities/test.nf \
+  -entry test_wf \
-    -entry test_wf \
+  -profile docker,no_publish \
-    -resume \
+  -c src/workflows/utils/labels_ci.config \
-    -profile docker \
+  -c src/workflows/utils/integration_tests.config
    -c src/workflows/utils/labels_ci.config \
    -c src/workflows/utils/integration_tests.config
--- a/src/workflows/multiomics/split_modalities/test.nf
+++ b/src/workflows/multiomics/split_modalities/test.nf
@@ -7,10 +7,12 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
    [
      id: "mouse",
-      input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
+      input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
      publish_dir: "foo/",
      output: "modalities",
      output_types: "types.csv"
--- a/src/workflows/prot/prot_multisample/integration_test.sh
+++ b/src/workflows/prot/prot_multisample/integration_test.sh
@@ -6,12 +6,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-viash ns build -q prot_multisample
+nextflow \
-
+  run . \
 nextflow run . \
  -main-script src/workflows/prot/prot_multisample/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
-  -with-trace work/trace.txt \
+  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config
+  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/prot/prot_multisample/test.nf
+++ b/src/workflows/prot/prot_multisample/test.nf
@@ -6,17 +6,19 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "adt_samples_axis_0",
        sample_id: "pbmc",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        clr_axis: 0
      ],
      [
        id: "adt_samples_axis_1",
        sample_id: "pbmc",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
        clr_axis: 1
      ]
    ])
--- a/src/workflows/prot/prot_singlesample/integration_test.sh
+++ b/src/workflows/prot/prot_singlesample/integration_test.sh
@@ -6,15 +6,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=21.10.6
+nextflow \
-
+  run . \
 viash ns build -q prot_singlesample
 nextflow run . \
  -main-script src/workflows/multiomics/prot_singlesample/test.nf \
  -profile docker,no_publish \
  -resume \
  -entry test_wf \
-  -with-trace work/trace.txt \
+  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
-  -c src/workflows/utils/integration_tests.config
+  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/prot/prot_singlesample/main.nf
+++ b/src/workflows/prot/prot_singlesample/main.nf
@@ -44,7 +44,6 @@ workflow run_wf {
        return newState
      },
      toState: ["output": "output"],
      auto: [ publish: true ]
    )
    | setState(["output"])
--- a/src/workflows/prot/prot_singlesample/test.nf
+++ b/src/workflows/prot/prot_singlesample/test.nf
@@ -6,10 +6,12 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "foo",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
        min_counts: 3,
        max_counts: 100000,
        min_genes_per_cell: 2,
--- a/src/workflows/qc/qc/integration_test.sh
+++ b/src/workflows/qc/qc/integration_test.sh
@@ -1,22 +1,15 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 export NXF_VER=21.10.6
 viash ns build -q '^workflows/qc/qc'
 nextflow \
  run . \
  -main-script src/workflows/qc/qc/test.nf \
  -entry test_wf \
  -resume \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/qc/qc/main.nf
+++ b/src/workflows/qc/qc/main.nf
@@ -81,7 +81,7 @@ workflow run_wf {
            "compression": "gzip"
          ]
        },
-        auto: [ publish: true ]
+        toState: ["output": "output"]
      )
      | setState(["output"]) 
--- a/src/workflows/qc/qc/test.nf
+++ b/src/workflows/qc/qc/test.nf
@@ -7,15 +7,17 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = 
    Channel.fromList([
      [
        id: "mouse_test",
-        input: file(params.resources_test).resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
+        input: resources_test.resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
      ],
      [
        id: "human_test",
-        input: file(params.resources_test).resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
+        input: resources_test.resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
      ]
    ])
    | map { state -> [state.id, state] }
--- a/src/workflows/rna/rna_multisample/integration_test.sh
+++ b/src/workflows/rna/rna_multisample/integration_test.sh
@@ -1,22 +1,15 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-viash ns build -q rna_multisample
+nextflow \
-
+  run . \
 export NXF_VER=21.10.6
 nextflow run . \
  -main-script src/workflows/rna/rna_multisample/test.nf \
  -profile docker,no_publish \
  -resume \
  -entry test_wf \
-  -with-trace work/trace.txt \
+  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/rna/rna_multisample/test.nf
+++ b/src/workflows/rna/rna_multisample/test.nf
@@ -6,10 +6,12 @@ params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "simple_execution_test",
-        input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
+        input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
        output: "concatenated_file.final.h5mu"
      ]
    ])
--- a/src/workflows/rna/rna_singlesample/integration_test.sh
+++ b/src/workflows/rna/rna_singlesample/integration_test.sh
@@ -1,30 +1,23 @@
 #!/bin/bash
 # get the root of the directory
 REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
-export NXF_VER=21.10.6
+nextflow \
-
+  run . \
 viash ns build -q rna_singlesample
 # viash ns build -q 'filter|publish|qc|metadata' --parallel --setup cb
 nextflow run . \
  -main-script src/workflows/rna/rna_singlesample/test.nf \
  -profile docker,no_publish \
  -entry test_wf \
-  -with-trace work/trace.txt \
+  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
-nextflow run . \
+nextflow \
  run . \
  -entry test_wf2 \
  -main-script src/workflows/rna/rna_singlesample/test.nf \
  -profile docker,no_publish \
  -entry test_wf2 \
  -with-trace work/trace.txt \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
--- a/src/workflows/rna/rna_singlesample/main.nf
+++ b/src/workflows/rna/rna_singlesample/main.nf
@@ -133,8 +133,8 @@ workflow run_wf {
        "layer": "layer",
      ],
      args: [output_compression: "gzip"],
      auto: [ publish: true ]
    )
    | setState(["output": "output"])
  emit:
  output_ch
--- a/src/workflows/rna/rna_singlesample/test.nf
+++ b/src/workflows/rna/rna_singlesample/test.nf
@@ -5,13 +5,13 @@ include { rna_singlesample } from params.rootDir + "/target/nextflow/workflows/r
 params.resources_test = params.rootDir + "/resources_test"
 workflow test_wf {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
        id: "mitochondrial_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
        min_counts: 3,
        max_counts: 10000000,
        min_genes_per_cell: 2,
@@ -27,7 +27,7 @@ workflow test_wf {
      ],
      [
        id: "simple_execution_test",
-        input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
        min_counts: 3,
        max_counts: 10000000,
        min_genes_per_cell: 2,
@@ -54,8 +54,8 @@ workflow test_wf {
 }
 workflow test_wf2 {
-  // allow changing the resources_test dir
+
-  resources_test = file("${params.rootDir}/resources_test")
+  resources_test = file(params.resources_test)
  output_ch = Channel.fromList([
      [
--- a/src/workflows/test_workflows/qc/config.vsh.yaml
+++ b/src/workflows/test_workflows/qc/config.vsh.yaml
@@ -39,4 +39,6 @@ engines:
        __merge__: /src/base/requirements/viashpy.yaml
 runners:
  - type: executable
-  - type: nextflow
+  - type: nextflow
    directives:
      label: [midmem, midcpu]
--- a/target/executable/annotate/celltypist/.config.vsh.yaml
+++ b/target/executable/annotate/celltypist/.config.vsh.yaml
@@ -0,0 +1,427 @@
 name: "celltypist"
 namespace: "annotate"
 version: "fix-integration-tests"
 authors:
 - name: "Jakub Majercik"
  roles:
  - "author"
  info:
    role: "Contributor"
    links:
      email: "jakub@data-intuitive.com"
      github: "jakubmajercik"
      linkedin: "jakubmajercik"
    organizations:
    - name: "Data Intuitive"
      href: "https://www.data-intuitive.com"
      role: "Bioinformatics Engineer"
 - name: "Weiwei Schultz"
  roles:
  - "contributor"
  info:
    role: "Contributor"
    organizations:
    - name: "Janssen R&D US"
      role: "Associate Director Data Sciences"
 argument_groups:
 - name: "Inputs"
  description: "Input dataset (query) arguments"
  arguments:
  - type: "file"
    name: "--input"
    alternatives:
    - "-i"
    description: "The input (query) data to be labeled. Should be a .h5mu file."
    info: null
    example:
    - "input.h5mu"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--modality"
    description: "Which modality to process."
    info: null
    default:
    - "rna"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--input_layer"
    description: "The layer in the input data to be used for cell type annotation\
      \ if .X is not to be used."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--var_query_gene_names"
    description: "The name of the adata var column in the input data containing gene\
      \ names; when no gene_name_layer is provided, the var index will be used.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
 - name: "Reference"
  description: "Arguments related to the reference dataset."
  arguments:
  - type: "file"
    name: "--reference"
    description: "The reference data to train the CellTypist classifiers on. Only\
      \ required if a pre-trained --model is not provided."
    info: null
    example:
    - "reference.h5mu"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--reference_layer"
    description: "The layer in the reference data to be used for cell type annotation\
      \ if .X is not to be used. Data are expected to be processed in the same way\
      \ as the --input query dataset."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--reference_obs_target"
    description: "The name of the adata obs column in the reference data containing\
      \ cell type annotations."
    info: null
    default:
    - "cell_ontology_class"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "boolean_true"
    name: "--check_expression"
    description: "Whether to check the expression of the reference dataset to the\
      \ format recommended by CellTypist.\nCellTypist requires data to be log-normalized\
      \ to 10000 counts per cell.\n"
    info: null
    direction: "input"
  - type: "string"
    name: "--var_reference_gene_names"
    description: "The name of the adata var column in the reference data containing\
      \ gene names; when no gene_name_layer is provided, the var index will be used.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
 - name: "Model arguments"
  description: "Model arguments."
  arguments:
  - type: "file"
    name: "--model"
    description: "Pretrained model in pkl format. If not provided, the model will\
      \ be trained on the reference data and --reference should be provided."
    info: null
    example:
    - "pretrained_model.pkl"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "boolean"
    name: "--feature_selection"
    description: "Whether to perform feature selection."
    info: null
    default:
    - false
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "boolean"
    name: "--majority_voting"
    description: "Whether to refine the predicted labels by running the majority voting\
      \ classifier after over-clustering."
    info: null
    default:
    - false
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "double"
    name: "--C"
    description: "Inverse of regularization strength in logistic regression."
    info: null
    default:
    - 1.0
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--max_iter"
    description: "Maximum number of iterations before reaching the minimum of the\
      \ cost function."
    info: null
    default:
    - 1000
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "boolean_true"
    name: "--use_SGD"
    description: "Whether to use the stochastic gradient descent algorithm."
    info: null
    direction: "input"
  - type: "double"
    name: "--min_prop"
    description: "\"For the dominant cell type within a subcluster, the minimum proportion\
      \ of cells required to \nsupport naming of the subcluster by this cell type.\
      \ Ignored if majority_voting is set to False. \nSubcluster that fails to pass\
      \ this proportion threshold will be assigned 'Heterogeneous'.\"\n"
    info: null
    default:
    - 0.0
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
 - name: "Outputs"
  description: "Output arguments."
  arguments:
  - type: "file"
    name: "--output"
    description: "Output h5mu file."
    info: null
    example:
    - "output.h5mu"
    must_exist: true
    create_parent: true
    required: false
    direction: "output"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--output_compression"
    info: null
    example:
    - "gzip"
    required: false
    choices:
    - "gzip"
    - "lzf"
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--output_obs_predictions"
    description: "In which `.obs` slots to store the predicted information.\n"
    info: null
    default:
    - "celltypist_pred"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--output_obs_probability"
    description: "In which `.obs` slots to store the probability of the predictions.\n"
    info: null
    default:
    - "celltypist_probability"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
 resources:
 - type: "python_script"
  path: "script.py"
  is_executable: true
 - type: "file"
  path: "setup_logger.py"
 - type: "file"
  path: "nextflow_labels.config"
  dest: "nextflow_labels.config"
 description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\
  \ of logistic regression classifiers optimised by the stochastic gradient descent\
  \ algorithm."
 test_resources:
 - type: "python_script"
  path: "test.py"
  is_executable: true
 - type: "file"
  path: "annotation_test_data"
 - type: "file"
  path: "pbmc_1k_protein_v3"
 - type: "file"
  path: "openpipelinetestutils"
  dest: "openpipelinetestutils"
 info: null
 status: "enabled"
 links:
  repository: "https://github.com/openpipelines-bio/openpipeline"
  docker_registry: "ghcr.io"
 runners:
 - type: "executable"
  id: "executable"
  docker_setup_strategy: "ifneedbepullelsecachedbuild"
 - type: "nextflow"
  id: "nextflow"
  directives:
    tag: "$id"
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    labels:
      mem1gb: "memory = 1000000000.B"
      mem2gb: "memory = 2000000000.B"
      mem5gb: "memory = 5000000000.B"
      mem10gb: "memory = 10000000000.B"
      mem20gb: "memory = 20000000000.B"
      mem50gb: "memory = 50000000000.B"
      mem100gb: "memory = 100000000000.B"
      mem200gb: "memory = 200000000000.B"
      mem500gb: "memory = 500000000000.B"
      mem1tb: "memory = 1000000000000.B"
      mem2tb: "memory = 2000000000000.B"
      mem5tb: "memory = 5000000000000.B"
      mem10tb: "memory = 10000000000000.B"
      mem20tb: "memory = 20000000000000.B"
      mem50tb: "memory = 50000000000000.B"
      mem100tb: "memory = 100000000000000.B"
      mem200tb: "memory = 200000000000000.B"
      mem500tb: "memory = 500000000000000.B"
      mem1gib: "memory = 1073741824.B"
      mem2gib: "memory = 2147483648.B"
      mem4gib: "memory = 4294967296.B"
      mem8gib: "memory = 8589934592.B"
      mem16gib: "memory = 17179869184.B"
      mem32gib: "memory = 34359738368.B"
      mem64gib: "memory = 68719476736.B"
      mem128gib: "memory = 137438953472.B"
      mem256gib: "memory = 274877906944.B"
      mem512gib: "memory = 549755813888.B"
      mem1tib: "memory = 1099511627776.B"
      mem2tib: "memory = 2199023255552.B"
      mem4tib: "memory = 4398046511104.B"
      mem8tib: "memory = 8796093022208.B"
      mem16tib: "memory = 17592186044416.B"
      mem32tib: "memory = 35184372088832.B"
      mem64tib: "memory = 70368744177664.B"
      mem128tib: "memory = 140737488355328.B"
      mem256tib: "memory = 281474976710656.B"
      mem512tib: "memory = 562949953421312.B"
      cpu1: "cpus = 1"
      cpu2: "cpus = 2"
      cpu5: "cpus = 5"
      cpu10: "cpus = 10"
      cpu20: "cpus = 20"
      cpu50: "cpus = 50"
      cpu100: "cpus = 100"
      cpu200: "cpus = 200"
      cpu500: "cpus = 500"
      cpu1000: "cpus = 1000"
    script:
    - "includeConfig(\"nextflow_labels.config\")"
  debug: false
  container: "docker"
 engines:
 - type: "docker"
  id: "docker"
  image: "python:3.10-slim"
  target_registry: "images.viash-hub.com"
  target_tag: "fix-integration-tests"
  namespace_separator: "/"
  setup:
  - type: "apt"
    packages:
    - "libhdf5-dev"
    - "procps"
    interactive: false
  - type: "python"
    user: false
    packages:
    - "scanpy~=1.9.6"
    upgrade: true
  - type: "python"
    user: false
    packages:
    - "celltypist==1.6.3"
    upgrade: true
  - type: "python"
    user: false
    packages:
    - "anndata==0.10.8"
    - "mudata~=0.2.4"
    - "pandas!=2.1.2"
    - "numpy<2.0.0"
    upgrade: true
  test_setup:
  - type: "docker"
    copy:
    - "openpipelinetestutils /opt/openpipelinetestutils"
  - type: "python"
    user: false
    packages:
    - "/opt/openpipelinetestutils"
    upgrade: true
  - type: "python"
    user: false
    packages:
    - "viashpy==0.8.0"
    upgrade: true
  entrypoint: []
  cmd: null
 - type: "native"
  id: "native"
 build_info:
  config: "src/annotate/celltypist/config.vsh.yaml"
  runner: "executable"
  engine: "docker|native"
  output: "target/executable/annotate/celltypist"
  executable: "target/executable/annotate/celltypist/celltypist"
  viash_version: "0.9.0"
  git_commit: "da62b4ffe30b6ef36fcb7ef5944f29d45d1138ff"
  git_remote: "https://github.com/openpipelines-bio/openpipeline"
  git_tag: "0.2.0-1939-gda62b4ff"
 package_config:
  name: "openpipeline"
  version: "fix-integration-tests"
  info:
    test_resources:
    - type: "s3"
      path: "s3://openpipelines-data"
      dest: "resources_test"
  viash_version: "0.9.0"
  source: "src"
  target: "target"
  config_mods:
  - ".test_resources += {path: '/src/base/openpipelinetestutils', dest: 'openpipelinetestutils'}\n\
    .resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
    .runners[.type == 'nextflow'].directives.tag := '$id'\n.runners[.type == 'nextflow'].config.script\
    \ := 'includeConfig(\"nextflow_labels.config\")'\n"
  - ".engines += { type: \"native\" }"
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'fix-integration-tests'"
  organization: "vsh"
  links:
    repository: "https://github.com/openpipelines-bio/openpipeline"
    docker_registry: "ghcr.io"
    homepage: "https://openpipelines.bio"
    documentation: "https://openpipelines.bio/fundamentals"
    issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
--- a/target/executable/annotate/celltypist/celltypist
+++ b/target/executable/annotate/celltypist/celltypist
--- a/target/executable/annotate/celltypist/nextflow_labels.config
+++ b/target/executable/annotate/celltypist/nextflow_labels.config
@@ -0,0 +1,42 @@
 // Shared Nextflow resource settings: process-wide defaults followed by
 // label selectors that override cpus or memory for processes carrying
 // the matching label.
 process {
  // Default resources for components that hardly do any processing
  // (the memory request is 2 GB multiplied by the attempt number)
  memory = { 2.GB * task.attempt }
  cpus = 1
  // Retry for exit codes that have something to do with memory issues
  // (exit status 137 through 140 inclusive); any other status uses 'terminate'
  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
  maxRetries = 3
  // Optional upper bound read by get_memory(); while it is null, get_memory()
  // returns the requested amount unchanged
  maxMemory = null
  // Resource labels
  // CPU tiers: fixed core counts
  withLabel: singlecpu { cpus = 1 }
  withLabel: lowcpu { cpus = 4 }
  withLabel: midcpu { cpus = 10 }
  withLabel: highcpu { cpus = 20 }
  // Memory tiers: base amount multiplied by the attempt number, then passed
  // through get_memory() so process.maxMemory can limit it
  withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
  withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
  withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
 }
 /**
  * Limits a requested memory amount by the optional `process.maxMemory` setting.
  *
  * Behaviour:
  *  - `process.maxMemory` missing or falsy: the request is returned unchanged.
  *  - `task.attempt` equals `process.maxRetries`: `process.maxMemory` is returned as-is.
  *  - request larger than `process.maxMemory`: the cap is returned as a MemoryUnit.
  *  - otherwise: the request is returned unchanged.
  *
  * Any exception raised while evaluating the cap prints a diagnostic and exits
  * with status 1.
  *
  * @param to_compare the requested memory amount (callers in this file pass
  *                   values such as `4.GB * task.attempt`)
  * @return the memory amount to request
  */
 def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }
  try {
    // When the attempt number reaches maxRetries, request the whole cap.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    def memoryCap = process.maxMemory as nextflow.util.MemoryUnit
    // compareTo only guarantees the sign of its result, so test `> 0`
    // instead of `== 1`. The cap itself is returned here; the previous code
    // referenced an undefined `max_memory`, which always ended in the
    // catch block below.
    if (to_compare.compareTo(memoryCap) > 0) {
      return memoryCap
    }
    return to_compare
  } catch (all) {
        println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
        System.exit(1)
  }
 }
--- a/target/executable/annotate/celltypist/setup_logger.py
+++ b/target/executable/annotate/celltypist/setup_logger.py
@@ -0,0 +1,12 @@
 def setup_logger():
    """Configure the root logger to emit INFO-level records on stdout.

    The root logger's level is set to INFO and a ``StreamHandler`` writing to
    the current ``sys.stdout`` is attached, formatted as
    ``"<asctime> <levelname padded to 8> <message>"``.

    The function is idempotent: if a stream handler bound to the current
    stdout is already attached, no further handler is added, so calling it
    more than once does not duplicate every log line.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # A second handler on the same stream would print each record twice.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)
    return logger
--- a/target/executable/annotate/onclass/.config.vsh.yaml
+++ b/target/executable/annotate/onclass/.config.vsh.yaml
@@ -0,0 +1,367 @@
 name: "onclass"
 namespace: "annotate"
 version: "fix-integration-tests"
 authors:
 - name: "Jakub Majercik"
  roles:
  - "author"
  info:
    role: "Contributor"
    links:
      email: "jakub@data-intuitive.com"
      github: "jakubmajercik"
      linkedin: "jakubmajercik"
    organizations:
    - name: "Data Intuitive"
      href: "https://www.data-intuitive.com"
      role: "Bioinformatics Engineer"
 argument_groups:
 - name: "Inputs"
  description: "Input dataset (query) arguments"
  arguments:
  - type: "file"
    name: "--input"
    alternatives:
    - "-i"
    description: "The input (query) data to be labeled. Should be a .h5mu file."
    info: null
    example:
    - "input.h5mu"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--modality"
    description: "Which modality to process."
    info: null
    default:
    - "rna"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--input_layer"
    description: "The layer in the input data to be used for cell type annotation\
      \ if .X is not to be used."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--cl_nlp_emb_file"
    description: "The .nlp.emb file with the cell type embeddings."
    info: null
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--cl_ontology_file"
    description: "The .ontology file with the cell type ontology."
    info: null
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--cl_obo_file"
    description: "The .obo file with the cell type ontology."
    info: null
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--var_query_gene_names"
    description: "The name of the adata var column in the input data containing gene\
      \ names; when no value is provided, the var index will be used.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
 - name: "Reference"
  description: "Arguments related to the reference dataset."
  arguments:
  - type: "file"
    name: "--reference"
    description: "The reference data to train the OnClass model on. Only\
      \ required if a pre-trained --model is not provided."
    info: null
    example:
    - "reference.h5mu"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--reference_layer"
    description: "The layer in the reference data to be used for cell type annotation\
      \ if .X is not to be used."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--reference_obs_target"
    description: "The name of the adata obs column in the reference data containing\
      \ cell type annotations."
    info: null
    example:
    - "cell_ontology_class"
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
 - name: "Outputs"
  description: "Output arguments."
  arguments:
  - type: "file"
    name: "--output"
    description: "Output h5mu file."
    info: null
    example:
    - "output.h5mu"
    must_exist: true
    create_parent: true
    required: false
    direction: "output"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--output_compression"
    info: null
    example:
    - "gzip"
    required: false
    choices:
    - "gzip"
    - "lzf"
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--output_obs_predictions"
    description: "In which `.obs` slots to store the predicted information.\n"
    info: null
    default:
    - "onclass_pred"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--output_obs_probability"
    description: "In which `.obs` slots to store the probability of the predictions.\n"
    info: null
    default:
    - "onclass_prob"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
 - name: "Model arguments"
  description: "Model arguments"
  arguments:
  - type: "string"
    name: "--model"
    description: "Pretrained model path without a file extension. If not provided,\
      \ the model will be trained \non the reference data and --reference should be\
      \ provided. The path namespace should contain:\n  - a .npz or .pkl file\n  -\
      \ a .data file\n  - a .meta file\n  - a .index file\ne.g. /path/to/model/pretrained_model_target1\
      \ as saved by OnClass.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--max_iter"
    description: "Maximum number of iterations for training the model."
    info: null
    default:
    - 30
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
 resources:
 - type: "python_script"
  path: "script.py"
  is_executable: true
 - type: "file"
  path: "setup_logger.py"
 - type: "file"
  path: "nextflow_labels.config"
  dest: "nextflow_labels.config"
 description: "OnClass is a python package for single-cell cell type annotation. It\
  \ uses the Cell Ontology to capture the cell type similarity. \nThese similarities\
  \ enable OnClass to annotate cell types that are never seen in the training data.\n"
 test_resources:
 - type: "python_script"
  path: "test.py"
  is_executable: true
 - type: "file"
  path: "annotation_test_data"
 - type: "file"
  path: "pbmc_1k_protein_v3"
 - type: "file"
  path: "openpipelinetestutils"
  dest: "openpipelinetestutils"
 info: null
 status: "enabled"
 links:
  repository: "https://github.com/openpipelines-bio/openpipeline"
  docker_registry: "ghcr.io"
 runners:
 - type: "executable"
  id: "executable"
  docker_setup_strategy: "ifneedbepullelsecachedbuild"
 - type: "nextflow"
  id: "nextflow"
  directives:
    tag: "$id"
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    labels:
      mem1gb: "memory = 1000000000.B"
      mem2gb: "memory = 2000000000.B"
      mem5gb: "memory = 5000000000.B"
      mem10gb: "memory = 10000000000.B"
      mem20gb: "memory = 20000000000.B"
      mem50gb: "memory = 50000000000.B"
      mem100gb: "memory = 100000000000.B"
      mem200gb: "memory = 200000000000.B"
      mem500gb: "memory = 500000000000.B"
      mem1tb: "memory = 1000000000000.B"
      mem2tb: "memory = 2000000000000.B"
      mem5tb: "memory = 5000000000000.B"
      mem10tb: "memory = 10000000000000.B"
      mem20tb: "memory = 20000000000000.B"
      mem50tb: "memory = 50000000000000.B"
      mem100tb: "memory = 100000000000000.B"
      mem200tb: "memory = 200000000000000.B"
      mem500tb: "memory = 500000000000000.B"
      mem1gib: "memory = 1073741824.B"
      mem2gib: "memory = 2147483648.B"
      mem4gib: "memory = 4294967296.B"
      mem8gib: "memory = 8589934592.B"
      mem16gib: "memory = 17179869184.B"
      mem32gib: "memory = 34359738368.B"
      mem64gib: "memory = 68719476736.B"
      mem128gib: "memory = 137438953472.B"
      mem256gib: "memory = 274877906944.B"
      mem512gib: "memory = 549755813888.B"
      mem1tib: "memory = 1099511627776.B"
      mem2tib: "memory = 2199023255552.B"
      mem4tib: "memory = 4398046511104.B"
      mem8tib: "memory = 8796093022208.B"
      mem16tib: "memory = 17592186044416.B"
      mem32tib: "memory = 35184372088832.B"
      mem64tib: "memory = 70368744177664.B"
      mem128tib: "memory = 140737488355328.B"
      mem256tib: "memory = 281474976710656.B"
      mem512tib: "memory = 562949953421312.B"
      cpu1: "cpus = 1"
      cpu2: "cpus = 2"
      cpu5: "cpus = 5"
      cpu10: "cpus = 10"
      cpu20: "cpus = 20"
      cpu50: "cpus = 50"
      cpu100: "cpus = 100"
      cpu200: "cpus = 200"
      cpu500: "cpus = 500"
      cpu1000: "cpus = 1000"
    script:
    - "includeConfig(\"nextflow_labels.config\")"
  debug: false
  container: "docker"
 engines:
 - type: "docker"
  id: "docker"
  image: "python:3.8"
  target_registry: "images.viash-hub.com"
  target_tag: "fix-integration-tests"
  namespace_separator: "/"
  setup:
  - type: "python"
    user: false
    packages:
    - "scikit-learn==0.24.0"
    - "OnClass==1.2"
    - "tensorflow==2.13.1"
    - "obonet==1.1.0"
    - "mudata"
    upgrade: true
  test_setup:
  - type: "docker"
    copy:
    - "openpipelinetestutils /opt/openpipelinetestutils"
  - type: "python"
    user: false
    packages:
    - "/opt/openpipelinetestutils"
    upgrade: true
  - type: "python"
    user: false
    packages:
    - "viashpy==0.8.0"
    upgrade: true
  entrypoint: []
  cmd: null
 - type: "native"
  id: "native"
 build_info:
  config: "src/annotate/onclass/config.vsh.yaml"
  runner: "executable"
  engine: "docker|native"
  output: "target/executable/annotate/onclass"
  executable: "target/executable/annotate/onclass/onclass"
  viash_version: "0.9.0"
  git_commit: "da62b4ffe30b6ef36fcb7ef5944f29d45d1138ff"
  git_remote: "https://github.com/openpipelines-bio/openpipeline"
  git_tag: "0.2.0-1939-gda62b4ff"
 package_config:
  name: "openpipeline"
  version: "fix-integration-tests"
  info:
    test_resources:
    - type: "s3"
      path: "s3://openpipelines-data"
      dest: "resources_test"
  viash_version: "0.9.0"
  source: "src"
  target: "target"
  config_mods:
  - ".test_resources += {path: '/src/base/openpipelinetestutils', dest: 'openpipelinetestutils'}\n\
    .resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
    .runners[.type == 'nextflow'].directives.tag := '$id'\n.runners[.type == 'nextflow'].config.script\
    \ := 'includeConfig(\"nextflow_labels.config\")'\n"
  - ".engines += { type: \"native\" }"
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'fix-integration-tests'"
  organization: "vsh"
  links:
    repository: "https://github.com/openpipelines-bio/openpipeline"
    docker_registry: "ghcr.io"
    homepage: "https://openpipelines.bio"
    documentation: "https://openpipelines.bio/fundamentals"
    issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
--- a/target/executable/annotate/onclass/nextflow_labels.config
+++ b/target/executable/annotate/onclass/nextflow_labels.config
@@ -0,0 +1,42 @@
 // Shared Nextflow resource settings: process-wide defaults followed by
 // label selectors that override cpus or memory for processes carrying
 // the matching label.
 process {
  // Default resources for components that hardly do any processing
  // (the memory request is 2 GB multiplied by the attempt number)
  memory = { 2.GB * task.attempt }
  cpus = 1
  // Retry for exit codes that have something to do with memory issues
  // (exit status 137 through 140 inclusive); any other status uses 'terminate'
  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
  maxRetries = 3
  // Optional upper bound read by get_memory(); while it is null, get_memory()
  // returns the requested amount unchanged
  maxMemory = null
  // Resource labels
  // CPU tiers: fixed core counts
  withLabel: singlecpu { cpus = 1 }
  withLabel: lowcpu { cpus = 4 }
  withLabel: midcpu { cpus = 10 }
  withLabel: highcpu { cpus = 20 }
  // Memory tiers: base amount multiplied by the attempt number, then passed
  // through get_memory() so process.maxMemory can limit it
  withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
  withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
  withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
 }
 /**
  * Limits a requested memory amount by the optional `process.maxMemory` setting.
  *
  * Behaviour:
  *  - `process.maxMemory` missing or falsy: the request is returned unchanged.
  *  - `task.attempt` equals `process.maxRetries`: `process.maxMemory` is returned as-is.
  *  - request larger than `process.maxMemory`: the cap is returned as a MemoryUnit.
  *  - otherwise: the request is returned unchanged.
  *
  * Any exception raised while evaluating the cap prints a diagnostic and exits
  * with status 1.
  *
  * @param to_compare the requested memory amount (callers in this file pass
  *                   values such as `4.GB * task.attempt`)
  * @return the memory amount to request
  */
 def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }
  try {
    // When the attempt number reaches maxRetries, request the whole cap.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    def memoryCap = process.maxMemory as nextflow.util.MemoryUnit
    // compareTo only guarantees the sign of its result, so test `> 0`
    // instead of `== 1`. The cap itself is returned here; the previous code
    // referenced an undefined `max_memory`, which always ended in the
    // catch block below.
    if (to_compare.compareTo(memoryCap) > 0) {
      return memoryCap
    }
    return to_compare
  } catch (all) {
        println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
        System.exit(1)
  }
 }
--- a/target/executable/annotate/onclass/onclass
+++ b/target/executable/annotate/onclass/onclass
--- a/target/executable/annotate/onclass/setup_logger.py
+++ b/target/executable/annotate/onclass/setup_logger.py
@@ -0,0 +1,12 @@
 def setup_logger():
    """Configure the root logger to emit INFO-level records on stdout.

    The root logger's level is set to INFO and a ``StreamHandler`` writing to
    the current ``sys.stdout`` is attached, formatted as
    ``"<asctime> <levelname padded to 8> <message>"``.

    The function is idempotent: if a stream handler bound to the current
    stdout is already attached, no further handler is added, so calling it
    more than once does not duplicate every log line.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # A second handler on the same stream would print each record twice.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)
    return logger
--- a/target/executable/annotate/popv/.config.vsh.yaml
+++ b/target/executable/annotate/popv/.config.vsh.yaml
@@ -0,0 +1,387 @@
 name: "popv"
 namespace: "annotate"
 version: "fix-integration-tests"
 authors:
 - name: "Matthias Beyens"
  roles:
  - "author"
  info:
    role: "Contributor"
    links:
      github: "MatthiasBeyens"
      orcid: "0000-0003-3304-0706"
      email: "matthias.beyens@gmail.com"
      linkedin: "mbeyens"
    organizations:
    - name: "Janssen Pharmaceuticals"
      href: "https://www.janssen.com"
      role: "Principal Scientist"
 - name: "Robrecht Cannoodt"
  roles:
  - "author"
  info:
    role: "Core Team Member"
    links:
      email: "robrecht@data-intuitive.com"
      github: "rcannood"
      orcid: "0000-0003-3641-729X"
      linkedin: "robrechtcannoodt"
    organizations:
    - name: "Data Intuitive"
      href: "https://www.data-intuitive.com"
      role: "Data Science Engineer"
    - name: "Open Problems"
      href: "https://openproblems.bio"
      role: "Core Member"
 argument_groups:
 - name: "Inputs"
  description: "Arguments related to the input (aka query) dataset."
  arguments:
  - type: "file"
    name: "--input"
    alternatives:
    - "-i"
    description: "Input h5mu file."
    info: null
    example:
    - "input.h5mu"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--modality"
    description: "Which modality to process."
    info: null
    default:
    - "rna"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--input_layer"
    description: "Which layer to use. If no value is provided, the counts are assumed\
      \ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--input_obs_batch"
    description: "Key in obs field of input adata for batch information. If no value\
      \ is provided, batch label is assumed to be unknown."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--input_var_subset"
    description: "Subset the input object with this column."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--input_obs_label"
    description: "Key in obs field of input adata for label information. This is only\
      \ used for training scANVI. Unlabelled cells should be set to `\"unknown_celltype_label\"\
      `."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--unknown_celltype_label"
    description: "If `input_obs_label` is specified, cells with this value will be\
      \ treated as unknown and will be predicted by the model."
    info: null
    default:
    - "unknown"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
 - name: "Reference"
  description: "Arguments related to the reference dataset."
  arguments:
  - type: "file"
    name: "--reference"
    description: "User-provided reference tissue. The data that will be used as reference\
      \ to call cell types."
    info: null
    example:
    - "TS_Bladder_filtered.h5ad"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--reference_layer"
    description: "Which layer to use. If no value is provided, the counts are assumed\
      \ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--reference_obs_label"
    description: "Key in obs field of reference AnnData with cell-type information."
    info: null
    default:
    - "cell_ontology_class"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--reference_obs_batch"
    description: "Key in obs field of reference adata for batch information."
    info: null
    default:
    - "donor_assay"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
 - name: "Outputs"
  description: "Output arguments."
  arguments:
  - type: "file"
    name: "--output"
    description: "Output h5mu file."
    info: null
    example:
    - "output.h5mu"
    must_exist: true
    create_parent: true
    required: true
    direction: "output"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--output_compression"
    info: null
    example:
    - "gzip"
    required: false
    choices:
    - "gzip"
    - "lzf"
    direction: "input"
    multiple: false
    multiple_sep: ";"
 - name: "Arguments"
  description: "Other arguments."
  arguments:
  - type: "string"
    name: "--methods"
    description: "Methods to call cell types. By default, runs knn_on_scvi and\
      \ scanvi."
    info: null
    example:
    - "knn_on_scvi"
    - "scanvi"
    required: true
    choices:
    - "celltypist"
    - "knn_on_bbknn"
    - "knn_on_scanorama"
    - "knn_on_scvi"
    - "onclass"
    - "rf"
    - "scanvi"
    - "svm"
    direction: "input"
    multiple: true
    multiple_sep: ";"
 resources:
 - type: "python_script"
  path: "script.py"
  is_executable: true
 - type: "file"
  path: "setup_logger.py"
 - type: "file"
  path: "nextflow_labels.config"
  dest: "nextflow_labels.config"
 description: "Performs popular major vote cell typing on single cell sequence data\
  \ using multiple algorithms. Note that this is a one-shot version of PopV."
 test_resources:
 - type: "python_script"
  path: "test.py"
  is_executable: true
 - type: "file"
  path: "annotation_test_data"
 - type: "file"
  path: "pbmc_1k_protein_v3"
 - type: "file"
  path: "openpipelinetestutils"
  dest: "openpipelinetestutils"
 info: null
 status: "enabled"
 links:
  repository: "https://github.com/openpipelines-bio/openpipeline"
  docker_registry: "ghcr.io"
 runners:
 - type: "executable"
  id: "executable"
  docker_setup_strategy: "ifneedbepullelsecachedbuild"
 - type: "nextflow"
  id: "nextflow"
  directives:
    label:
    - "highmem"
    - "highcpu"
    tag: "$id"
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    labels:
      mem1gb: "memory = 1000000000.B"
      mem2gb: "memory = 2000000000.B"
      mem5gb: "memory = 5000000000.B"
      mem10gb: "memory = 10000000000.B"
      mem20gb: "memory = 20000000000.B"
      mem50gb: "memory = 50000000000.B"
      mem100gb: "memory = 100000000000.B"
      mem200gb: "memory = 200000000000.B"
      mem500gb: "memory = 500000000000.B"
      mem1tb: "memory = 1000000000000.B"
      mem2tb: "memory = 2000000000000.B"
      mem5tb: "memory = 5000000000000.B"
      mem10tb: "memory = 10000000000000.B"
      mem20tb: "memory = 20000000000000.B"
      mem50tb: "memory = 50000000000000.B"
      mem100tb: "memory = 100000000000000.B"
      mem200tb: "memory = 200000000000000.B"
      mem500tb: "memory = 500000000000000.B"
      mem1gib: "memory = 1073741824.B"
      mem2gib: "memory = 2147483648.B"
      mem4gib: "memory = 4294967296.B"
      mem8gib: "memory = 8589934592.B"
      mem16gib: "memory = 17179869184.B"
      mem32gib: "memory = 34359738368.B"
      mem64gib: "memory = 68719476736.B"
      mem128gib: "memory = 137438953472.B"
      mem256gib: "memory = 274877906944.B"
      mem512gib: "memory = 549755813888.B"
      mem1tib: "memory = 1099511627776.B"
      mem2tib: "memory = 2199023255552.B"
      mem4tib: "memory = 4398046511104.B"
      mem8tib: "memory = 8796093022208.B"
      mem16tib: "memory = 17592186044416.B"
      mem32tib: "memory = 35184372088832.B"
      mem64tib: "memory = 70368744177664.B"
      mem128tib: "memory = 140737488355328.B"
      mem256tib: "memory = 281474976710656.B"
      mem512tib: "memory = 562949953421312.B"
      cpu1: "cpus = 1"
      cpu2: "cpus = 2"
      cpu5: "cpus = 5"
      cpu10: "cpus = 10"
      cpu20: "cpus = 20"
      cpu50: "cpus = 50"
      cpu100: "cpus = 100"
      cpu200: "cpus = 200"
      cpu500: "cpus = 500"
      cpu1000: "cpus = 1000"
    script:
    - "includeConfig(\"nextflow_labels.config\")"
  debug: false
  container: "docker"
 engines:
 - type: "docker"
  id: "docker"
  image: "python:3.9-slim"
  target_registry: "images.viash-hub.com"
  target_tag: "fix-integration-tests"
  namespace_separator: "/"
  setup:
  - type: "apt"
    packages:
    - "procps"
    - "git"
    - "build-essential"
    - "wget"
    interactive: false
  - type: "python"
    user: false
    packages:
    - "scanpy~=1.9.6"
    - "scvi-tools~=1.0.3"
    - "popv~=0.3.2"
    - "jax==0.4.10"
    - "jaxlib==0.4.10"
    - "ml-dtypes<0.3.0"
    - "scipy==1.12.0"
    upgrade: true
  - type: "python"
    user: false
    packages:
    - "anndata==0.10.8"
    - "mudata~=0.2.4"
    - "pandas!=2.1.2"
    - "numpy<2.0.0"
    upgrade: true
  - type: "docker"
    run:
    - "cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git && \\\n\
      \  cd PopV && git fetch --depth 1 origin tag v0.2 && git checkout v0.2\n"
  test_setup:
  - type: "python"
    user: false
    packages:
    - "viashpy==0.8.0"
    upgrade: true
  entrypoint: []
  cmd: null
 - type: "native"
  id: "native"
 build_info:
  config: "src/annotate/popv/config.vsh.yaml"
  runner: "executable"
  engine: "docker|native"
  output: "target/executable/annotate/popv"
  executable: "target/executable/annotate/popv/popv"
  viash_version: "0.9.0"
  git_commit: "da62b4ffe30b6ef36fcb7ef5944f29d45d1138ff"
  git_remote: "https://github.com/openpipelines-bio/openpipeline"
  git_tag: "0.2.0-1939-gda62b4ff"
 package_config:
  name: "openpipeline"
  version: "fix-integration-tests"
  info:
    test_resources:
    - type: "s3"
      path: "s3://openpipelines-data"
      dest: "resources_test"
  viash_version: "0.9.0"
  source: "src"
  target: "target"
  config_mods:
  - ".test_resources += {path: '/src/base/openpipelinetestutils', dest: 'openpipelinetestutils'}\n\
    .resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
    .runners[.type == 'nextflow'].directives.tag := '$id'\n.runners[.type == 'nextflow'].config.script\
    \ := 'includeConfig(\"nextflow_labels.config\")'\n"
  - ".engines += { type: \"native\" }"
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'fix-integration-tests'"
  organization: "vsh"
  links:
    repository: "https://github.com/openpipelines-bio/openpipeline"
    docker_registry: "ghcr.io"
    homepage: "https://openpipelines.bio"
    documentation: "https://openpipelines.bio/fundamentals"
    issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
--- a/target/executable/annotate/popv/nextflow_labels.config
+++ b/target/executable/annotate/popv/nextflow_labels.config
@@ -0,0 +1,42 @@
 // Shared Nextflow resource settings: process-wide defaults followed by
 // label selectors that override cpus or memory for processes carrying
 // the matching label.
 process {
  // Default resources for components that hardly do any processing
  // (the memory request is 2 GB multiplied by the attempt number)
  memory = { 2.GB * task.attempt }
  cpus = 1
  // Retry for exit codes that have something to do with memory issues
  // (exit status 137 through 140 inclusive); any other status uses 'terminate'
  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
  maxRetries = 3
  // Optional upper bound read by get_memory(); while it is null, get_memory()
  // returns the requested amount unchanged
  maxMemory = null
  // Resource labels
  // CPU tiers: fixed core counts
  withLabel: singlecpu { cpus = 1 }
  withLabel: lowcpu { cpus = 4 }
  withLabel: midcpu { cpus = 10 }
  withLabel: highcpu { cpus = 20 }
  // Memory tiers: base amount multiplied by the attempt number, then passed
  // through get_memory() so process.maxMemory can limit it
  withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
  withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
  withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
 }
 /**
  * Limits a requested memory amount by the optional `process.maxMemory` setting.
  *
  * Behaviour:
  *  - `process.maxMemory` missing or falsy: the request is returned unchanged.
  *  - `task.attempt` equals `process.maxRetries`: `process.maxMemory` is returned as-is.
  *  - request larger than `process.maxMemory`: the cap is returned as a MemoryUnit.
  *  - otherwise: the request is returned unchanged.
  *
  * Any exception raised while evaluating the cap prints a diagnostic and exits
  * with status 1.
  *
  * @param to_compare the requested memory amount (callers in this file pass
  *                   values such as `4.GB * task.attempt`)
  * @return the memory amount to request
  */
 def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }
  try {
    // When the attempt number reaches maxRetries, request the whole cap.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    def memoryCap = process.maxMemory as nextflow.util.MemoryUnit
    // compareTo only guarantees the sign of its result, so test `> 0`
    // instead of `== 1`. The cap itself is returned here; the previous code
    // referenced an undefined `max_memory`, which always ended in the
    // catch block below.
    if (to_compare.compareTo(memoryCap) > 0) {
      return memoryCap
    }
    return to_compare
  } catch (all) {
        println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
        System.exit(1)
  }
 }
--- a/Show More
+++ b/Show More