Build branch fix-integration-tests with version fix-integration-tests (da62b4ff)

Build pipeline: vsh-ci-dev-gckj5

Source commit: da62b4ffe3

Source message: Add labels to qc_test component
This commit is contained in:
CI
2024-11-15 14:37:33 +00:00
parent 43cfb251c7
commit bb7533583f
1550 changed files with 913066 additions and 791 deletions

View File

@@ -1,4 +1,4 @@
# openpipelines x.x.x
# openpipelines 2.x.x (Unreleased)
## BREAKING CHANGES
@@ -42,8 +42,6 @@
- Store label probabilities instead of uncertainties
- Take `.h5mu` format as an input instead of `.h5ad`
* `labels_transfer/knn`: delete outdated component due to its functionality now implemented in `labels_transfer/pynndescent_knn`
* `reference/build_cellranger_arc_reference`: a default value of "output" is now specified for the argument `--genome`, in line with the `reference/build_cellranger_reference` component. Additionally, providing a value for `--organism` is no longer required and its default value of `Homo Sapiens` has been removed (PR #864).
## NEW FUNCTIONALITY
@@ -91,8 +89,6 @@
* `dataflow/split_h5mu` component: Added a component to split a single h5mu file into multiple h5mu files based on the values of an .obs column (PR #824).
* `labels_transfer/pynndescent_knn`: component: Added a component for KNN classification based on a PyNNDescent neighborhood graph (PR #830).
* `workflows/test_workflows/ingestion` components & `workflows/ingestion`: Added standalone components for integration testing of ingestion workflows (PR #801).
* `workflows/ingestion/make_reference`: Add additional arguments passed through to the STAR and BD Rhapsody reference components (PR #846).
@@ -103,7 +99,7 @@
* `dimred/densmap` component: Added a densMAP dimensionality reduction component (PR #748).
* `annotete/scanvi` component: Added a component to annotate cells using scANVI (PR #833).
* `annotate/scanvi` component: Added a component to annotate cells using scANVI (PR #833).
* `transform/bpcells_regress_out` component: Added a component to regress out effects of confounding variables in the count matrix using BPCells (PR #863).
@@ -129,6 +125,10 @@
* `metadata/duplicate_var` component: Added a component to make a copy from one .var field or index to another .var field within the same MuData object (PR #877).
* `filter/subset_obsp` component: Added a component to subset an .obsp matrix by column based on the value of an .obs field. The resulting subset is moved to an .obsm field (PR #888).
* `labels_transfer/knn` component: Enable using additional distance functions for KNN classification (PR #830) and allow performing KNN classification based on a pre-calculated neighborhood graph (PR #890).
## MINOR CHANGES
* `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726).
@@ -142,8 +142,6 @@
* Bump scvelo to `0.3.2` (PR #828).
* Bump viash to `0.8.6` (PR #815).
* Pin numpy<2 for several components (PR #815).
* Added `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: download tiny bcl file with an ATAC experiment, download a motifs file, demultiplex bcl files to reads in fastq format (PR #726).
@@ -162,23 +160,38 @@
## BUG FIXES
* `dataflow/concatenate_h5mu`: fix writing out multidimensional annotation dataframes (e.g. `.varm`) that had their
data type (dtype) changed as a result of adding more observations after concatenation, causing `TypeError`.
One notable example of this happening is when one of the samples does not have a multimodal annotation dataframe
which is present in another sample, causing the values to be filled with `NA` (PR #837).
* `qc/calculate_qc_metrics`: increase total counts accuracy with low precision floating dtypes as input layer (PR #852).
* Fix failing tests for `ingestion/cellranger_postprocessing`, `ingestion/conversion` and `multiomics/process_batches` (PR #869).
* `convert/from_10xh5_to_h5mu`: add .uns slot to mdata root when metrics file is provided (PR #887).
* Use `params.resources_test` in test workflows in order to point to an alternative location (e.g. a cache).
* Fix ingestion components not working when optional arguments are unset (PR #894).
## DOCUMENTATION
* Update authorship of components (PR #835).
# openpipelines 1.0.3
## BUG FIXES
* `qc/calculate_qc_metrics`: increase total counts accuracy with low precision floating dtypes as input layer (PR # , backported from PR #852).
# openpipelines 1.0.2
## BUG FIXES
* `dataflow/concatenate_h5mu`: fix writing out multidimensional annotation dataframes (e.g. `.varm`) that had their
data type (dtype) changed as a result of adding more observations after concatenation, causing `TypeError`.
One notable example of this happening is when one of the samples does not have a multimodal annotation dataframe
which is present in another sample, causing the values to be filled with `NA` (PR #842, backported from PR #837).
# openpipelines 1.0.1
## BUG FIXES
* Bump viash to `0.8.6` (PR #816, backported from #815). This changes the Nextflow process that is generated at runtime from an in-memory to an on-disk temporary file, which should cause fewer issues with Nextflow Fusion.
# openpipelines 1.0.0-rc6
## BUG FIXES

View File

@@ -23,27 +23,26 @@ argument_groups:
type: string
default: "rna"
required: false
- name: "--var_input_gene_names"
description: .var field containing the gene names, if the .var index is not to be used.
type: string
required: false
- name: Reference
description: Arguments related to the reference dataset.
- name: Reference model
description: Arguments related to the reference model.
arguments:
- name: "--reference"
type: file
description: Reference h5mu file.
direction: input
required: true
example: reference.h5mu
- name: "--scvi_reference_model"
type: file
description: "Pretrained scvi reference model"
description: "Pretrained SCVI reference model to initialize the SCANVI model with. The model needs to be stored together with the AnnData object that was used to train it."
example: scvi_model.pt
direction: input
required: true
- name: "--reference_obs_label"
type: string
description: Key in obs field of reference AnnData with cell-type information.
example: "cell_ontology_class"
required: true
required: false
- name: "--scanvi_reference_model"
type: file
description: "Pretrained SCANVI reference model."
example: scvi_model.pt
direction: input
required: false
- name: SCANVI reference model training arguments
description: Arguments related to the reference SCANVI model.
@@ -190,6 +189,7 @@ resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/annotate/utils/query_reference_allignment.py
test_resources:
- type: python_script

View File

@@ -7,14 +7,39 @@ import numpy as np
par = {
"input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu",
"modality": "rna",
"reference": "resources_test/annotation_test_data/TS_Blood_filtered.h5ad",
"scvi_reference_model": "resources_test/annotation_test_data/scvi_model.pt",
"reference_obs_label": "cell_ontology_class",
"var_query_gene_names": None,
"scvi_reference_model": "resources_test/annotation_test_data/scvi_model",
"scanvi_reference_model": None,
"unknown_celltype": "Unkown",
"output": "output.h5mu",
"output_obsm_scanvi_embedding": "scanvi_embedding",
"output_obs_predictions": "scanvi_pred",
"output_obs_probability": "scanvi_probability",
"output_model": None,
"output_compression": None,
"reference_learning_rate": 1e-3,
"reference_reduce_lr_on_plateau": True,
"reference_lr_patience": 25,
"reference_lr_factor": 0.5,
"reference_train_size": 0.9,
"reference_max_epochs": 10,
"reference_early_stopping": True,
"reference_early_stopping_patience": 50,
"query_train_size": 0.9,
"query_max_epochs": 10,
"query_learning_rate": 1e-3,
"query_reduce_lr_on_plateau": True,
"query_lr_patience": 25,
"query_lr_factor": 0.5,
"query_early_stopping": True,
"query_early_stopping_patience": 50
}
meta = {}
meta = {"resources_dir": "src/annotate/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from query_reference_allignment import set_var_index, cross_check_genes
# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
@@ -33,72 +58,104 @@ def setup_logger():
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()
logger.info("Reading the input and reference data")
if (not par["scvi_reference_model"]) and not (par["scanvi_reference_model"]) or (par["scvi_reference_model"] and par["scanvi_reference_model"]):
raise ValueError("Make sure to provide either an '--scvi_reference_model' or a '--scanvi_reference_model', but not both.")
input_data = mu.read_h5mu(par["input"])
query = input_data.mod[par["modality"]]
reference_data = mu.read_h5mu(par["reference"])
reference = reference_data.mod[par["modality"]]
logger.info(f"Loading the pretrained scVI model from {par['scvi_reference_model']}")
scvi_reference_model = scvi.model.SCVI.load(par["scvi_reference_model"], reference)
def main():
logger.info("Reading the query data")
# Read in data
input_data = mu.read_h5mu(par["input"])
input_modality = input_data.mod[par["modality"]].copy()
# scANVI requires query and reference gene names to be equivalent
input_modality = set_var_index(input_modality, par["var_input_gene_names"])
logger.info("Setting up scANVI model")
if par["scanvi_reference_model"]:
scanvi_ref = scvi.model.SCANVI.from_scvi_model(
scvi_reference_model,
unlabeled_category=par["unknown_celltype"],
labels_key=par["reference_obs_label"],
logger.info(f"Loading the pretrained scANVI model from {par['scanvi_reference_model']} and updating it with the query data {par['input']}")
scanvi_query = scvi.model.SCANVI.load_query_data(
input_modality,
par["scanvi_reference_model"],
freeze_classifier=True,
inplace_subset_query_vars=True
)
elif par["scvi_reference_model"]:
logger.info("Reading in the reference model and associated reference data")
scvi_reference_model = scvi.model.SCVI.load(par["scvi_reference_model"])
reference = scvi_reference_model.adata
logger.info("Alligning genes in reference and query dataset")
# scANVI requires query and reference gene names to be equivalent
reference = set_var_index(reference)
# Subset query dataset based on genes present in reference
common_ens_ids = cross_check_genes(input_modality, reference)
input_modality = input_modality[:, common_ens_ids]
logger.info("Instantiating scANVI model from the scVI model")
scanvi_ref = scvi.model.SCANVI.from_scvi_model(
scvi_reference_model,
unlabeled_category=par["unknown_celltype"],
labels_key=scvi_reference_model.adata_manager._registry["setup_args"]["labels_key"],
)
reference_plan_kwargs = {
"lr": par["reference_learning_rate"],
"reduce_lr_on_plateau": par['reference_reduce_lr_on_plateau'],
"lr_patience": par['reference_lr_patience'],
"lr_factor": par['reference_lr_factor']
}
logger.info("Training scANVI model on reference data with celltype labels")
scanvi_ref.train(
train_size=par["reference_train_size"],
max_epochs=par['reference_max_epochs'],
early_stopping=par['reference_early_stopping'],
early_stopping_patience=par['reference_early_stopping_patience'],
plan_kwargs=reference_plan_kwargs,
check_val_every_n_epoch=1,
accelerator="auto"
)
logger.info(f"Updating scANVI model with query data {par['input']}")
scvi.model.SCANVI.prepare_query_anndata(input_modality, scanvi_ref, inplace=True)
scanvi_query = scvi.model.SCANVI.load_query_data(input_modality, scanvi_ref)
logger.info("Training scANVI model with query data")
query_plan_kwargs = {
"lr": par["query_learning_rate"],
"reduce_lr_on_plateau": par['query_reduce_lr_on_plateau'],
"lr_patience": par['query_lr_patience'],
"lr_factor": par['query_lr_factor']
}
scanvi_query.train(
train_size=par["query_train_size"],
max_epochs=par['query_max_epochs'],
early_stopping=par['query_early_stopping'],
early_stopping_patience=par['query_early_stopping_patience'],
plan_kwargs=query_plan_kwargs,
check_val_every_n_epoch=1,
accelerator="auto"
)
reference_plan_kwargs = {"lr": par["reference_learning_rate"],
"reduce_lr_on_plateau": par['reference_reduce_lr_on_plateau'],
"lr_patience": par['reference_lr_patience'],
"lr_factor": par['reference_lr_factor']
}
logger.info("Adding latent representation to query data")
input_modality.obsm[par["output_obsm_scanvi_embedding"]] = scanvi_query.get_latent_representation()
logger.info("Training scANVI model on reference data with celltype labels")
logger.info("Running predictions on query data")
input_modality.obs[par["output_obs_predictions"]] = scanvi_query.predict(input_modality)
input_modality.obs[par["output_obs_probability"]] = np.max(scanvi_query.predict(input_modality, soft=True), axis=1)
scanvi_ref.train(
train_size=par["reference_train_size"],
max_epochs=par['reference_max_epochs'],
early_stopping=par['reference_early_stopping'],
early_stopping_patience=par['reference_early_stopping_patience'],
plan_kwargs=reference_plan_kwargs,
check_val_every_n_epoch=1,
accelerator="auto",
)
logger.info("Saving output and model")
input_data.mod[par["modality"]] = input_modality
input_data.write_h5mu(par["output"], compression=par["output_compression"])
logger.info("Updating and training scANVI model with query data")
scvi.model.SCANVI.prepare_query_anndata(query, scanvi_ref, inplace=True)
scanvi_query = scvi.model.SCANVI.load_query_data(query, scanvi_ref)
if par["output_model"]:
scanvi_query.save(par["output_model"], overwrite=True)
query_plan_kwargs = {"lr": par["query_learning_rate"],
"reduce_lr_on_plateau": par['query_reduce_lr_on_plateau'],
"lr_patience": par['query_lr_patience'],
"lr_factor": par['query_lr_factor']
}
scanvi_query.train(
train_size=par["query_train_size"],
max_epochs=par['query_max_epochs'],
early_stopping=par['query_early_stopping'],
early_stopping_patience=par['query_early_stopping_patience'],
plan_kwargs=query_plan_kwargs,
check_val_every_n_epoch=1,
accelerator="auto",
)
logger.info("Adding latent representation to query data")
query.obsm[par["output_obsm_scanvi_embedding"]] = scanvi_query.get_latent_representation()
logger.info("Running predictions on query data")
query.obs[par["output_obs_predictions"]] = scanvi_query.predict(query)
query.obs[par["output_obs_probability"]] = np.max(scanvi_query.predict(query, soft=True), axis=1)
logger.info("Saving output and model")
input_data.mod[par["modality"]] = query
input_data.write_h5mu(par["output"], compression=par["output_compression"])
if par["output_model"]:
scanvi_query.save(par["output_model"], overwrite=True)
if __name__ == '__main__':
main()

View File

@@ -1,9 +1,9 @@
import subprocess
import sys
import os
import pytest
import re
import mudata as mu
import anndata as ad
from openpipelinetestutils.asserters import assert_annotation_objects_equal
import scvi
import os
@@ -16,6 +16,7 @@ meta = {
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
@pytest.fixture
def create_scvi_model(random_path, tmp_path):
def wrapper(input_file, reference_file):
@@ -23,7 +24,7 @@ def create_scvi_model(random_path, tmp_path):
input_modality = input_data.mod["rna"]
reference_data = mu.read_h5mu(reference_file)
reference_modality = reference_data.mod["rna"]
reference_data.var["gene_symbol"] = list(reference_data.var.index)
reference_data.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_data.var["ensemblid"]]
reference_modality.var["gene_symbol"] = list(reference_modality.var.index)
@@ -47,30 +48,49 @@ def create_scvi_model(random_path, tmp_path):
n_layers=1,
)
scvi_model.train(max_epochs=10)
input_data.mod["rna"] = query
reference_data.mod["rna"] = reference
# reference_data.mod["rna"] = reference
input_data_file = random_path(extension="h5mu")
reference_file = random_path(extension="h5mu")
# reference_file = random_path(extension="h5mu")
scvi_model_file = tmp_path
input_data.write_h5mu(input_data_file)
reference_data.write_h5mu(reference_file)
scvi_model.save(scvi_model_file, overwrite=True)
return scvi_model_file, input_data_file, reference_file
# reference_data.write_h5mu(reference_file)
scvi_model.save(scvi_model_file, save_anndata=True, overwrite=True)
return scvi_model_file, input_data_file
return wrapper
@pytest.fixture
def create_scanvi_model(create_scvi_model, tmp_path):
def scanvi_wrapper():
scvi_model_file, input_data_file = create_scvi_model(input_file, reference_file)
scvi_model = scvi.model.SCVI.load(scvi_model_file)
scanvi_model = scvi.model.SCANVI.from_scvi_model(
scvi_model,
unlabeled_category="Unkown",
labels_key="cell_ontology_class",
)
scanvi_model.train(max_epochs=10)
scanvi_model_file = tmp_path
scanvi_model.save(scanvi_model_file, save_anndata=True, overwrite=True)
return scanvi_model_file, input_data_file
return scanvi_wrapper
def test_simple_execution(run_component, random_h5mu_path, create_scvi_model):
scvi_model_file, input_file_scvi, reference_file_scvi = create_scvi_model(input_file, reference_file)
scvi_model_file, input_file_scvi = create_scvi_model(input_file, reference_file)
output_file = random_h5mu_path()
run_component([
"--input", input_file_scvi,
"--reference", reference_file_scvi,
"--scvi_reference_model", scvi_model_file,
"--reference_obs_label", "cell_ontology_class",
"--reference_max_epochs", "10",
"--query_max_epochs", "10",
"--output", output_file
@@ -80,7 +100,7 @@ def test_simple_execution(run_component, random_h5mu_path, create_scvi_model):
input_mudata = mu.read_h5mu(input_file_scvi)
output_mudata = mu.read_h5mu(output_file)
assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed"
assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed"
assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added"
@@ -89,16 +109,15 @@ def test_simple_execution(run_component, random_h5mu_path, create_scvi_model):
assert_annotation_objects_equal(input_mudata.mod["prot"],
output_mudata.mod["prot"])
def test_multiple_arguments(run_component, random_h5mu_path, create_scvi_model, tmp_path):
scvi_model_file, input_file_scvi, reference_file_scvi = create_scvi_model(input_file, reference_file)
scvi_model_file, input_file_scvi = create_scvi_model(input_file, reference_file)
output_file = random_h5mu_path()
run_component([
"--input", input_file_scvi,
"--reference", reference_file_scvi,
"--scvi_reference_model", scvi_model_file,
"--reference_obs_label", "cell_ontology_class",
"--output", output_file,
"--reference_max_epochs", "10",
"--reference_reduce_lr_on_plateau", "True",
@@ -123,20 +142,69 @@ def test_multiple_arguments(run_component, random_h5mu_path, create_scvi_model,
])
assert os.path.exists(output_file), "Output file does not exist"
assert os.path.exists(tmp_path / "model.pt"), "Model file does not exist"
assert os.path.exists(tmp_path / "model.pt"), "Model file does not exist"
input_mudata = mu.read_h5mu(input_file_scvi)
output_mudata = mu.read_h5mu(output_file)
assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed"
assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed"
assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added"
assert "scanvi_pred" in output_mudata.mod["rna"].obs.keys(), "Predictions not added"
assert "scanvi_probability" in output_mudata.mod["rna"].obs.keys(), "Probabilities not added"
assert_annotation_objects_equal(input_mudata.mod["prot"],
output_mudata.mod["prot"])
def test_pretrained_scanvi(run_component, random_h5mu_path, create_scanvi_model):
scanvi_model_file, input_file_scanvi = create_scanvi_model()
output_file = random_h5mu_path()
run_component([
"--input", input_file_scanvi,
"--scanvi_reference_model", scanvi_model_file,
"--reference_obs_label", "cell_ontology_class",
"--reference_max_epochs", "10",
"--query_max_epochs", "10",
"--output", output_file
])
assert os.path.exists(output_file), "Output file does not exist"
input_mudata = mu.read_h5mu(input_file_scanvi)
output_mudata = mu.read_h5mu(output_file)
assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed"
assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed"
assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added"
assert "scanvi_pred" in output_mudata.mod["rna"].obs.keys(), "Predictions not added"
assert "scanvi_probability" in output_mudata.mod["rna"].obs.keys(), "Probabilities not added"
assert_annotation_objects_equal(input_mudata.mod["prot"],
output_mudata.mod["prot"])
def test_raises(run_component, random_h5mu_path, create_scvi_model, create_scanvi_model):
scvi_model_file, input_file_scvi = create_scvi_model(input_file, reference_file)
scanvi_model_file, input_file_scanvi = create_scanvi_model()
output_file = random_h5mu_path()
with pytest.raises(subprocess.CalledProcessError) as err:
run_component([
"--input", input_file_scanvi,
"--scanvi_reference_model", scanvi_model_file,
"--scvi_reference_model", scvi_model_file,
"--reference_obs_label", "cell_ontology_class",
"--reference_max_epochs", "10",
"--query_max_epochs", "10",
"--output", output_file
])
assert re.search(
r"ValueError: Make sure to provide either an '--scvi_reference_model' or a '--scanvi_reference_model', but not both.",
err.value.stdout.decode('utf-8')
)
if __name__ == '__main__':
sys.exit(pytest.main([__file__]))
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,46 @@
import re
import anndata as ad
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    The root logger's level is set to INFO and a ``StreamHandler`` bound to
    ``sys.stdout`` is attached, using a timestamp / level / message format.

    The call is idempotent: when the root logger already owns a stream handler
    that writes to the current ``sys.stdout``, no further handler is added.
    Without this guard every call stacked one more handler on the root logger
    and each record was printed once per handler.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only attach a handler if nothing is writing to this stdout stream yet.
    stdout_handler_present = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not stdout_handler_present:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()
# Helper functions
def set_var_index(adata: ad.AnnData, var_name: str | None = None):
    """Replace ``adata.var.index`` with identifiers stripped of a trailing ``.<digits>`` suffix.

    Args:
        adata: object whose ``.var`` index is rewritten in place.
        var_name: name of the ``.var`` column to take the identifiers from.
            When it is falsy, the current ``.var`` index is used instead.

    Returns:
        The same ``adata`` object that was passed in (modified in place).
    """
    trailing_version = re.compile("\\.[0-9]+$")
    identifiers = adata.var[var_name] if var_name else adata.var.index
    adata.var.index = [trailing_version.sub("", identifier) for identifier in identifiers]
    return adata
def cross_check_genes(query: ad.AnnData, reference: ad.AnnData):
    """Return the gene ids present in both the query and the reference ``.var`` index.

    The number of variables in the reference, in the query and in their
    intersection is logged at INFO level.

    Args:
        query: dataset whose ``.var.index`` holds the query gene ids.
        reference: dataset whose ``.var.index`` holds the reference gene ids.

    Returns:
        list: every shared gene id exactly once, ordered by first appearance
        in ``reference.var.index``. The order is deterministic; iterating a
        ``set`` of strings can yield a different order on every interpreter
        run because of hash randomization.

    Raises:
        AssertionError: if fewer than 100 gene ids are shared.
    """
    logger.info("Detecting common vars based on gene ids")
    query_ids = set(query.var.index)
    # dict.fromkeys drops duplicate ids while keeping the reference order.
    common_ens_ids = list(
        dict.fromkeys(gene_id for gene_id in reference.var.index if gene_id in query_ids)
    )

    logger.info(" reference n_vars: %i", reference.n_vars)
    logger.info(" input n_vars: %i", query.n_vars)
    logger.info(" intersect n_vars: %i", len(common_ens_ids))
    assert len(common_ens_ids) >= 100, "The intersection of genes between the query and reference dataset is too small."

    return common_ens_ids
def subset_vars(adata: ad.AnnData, var_column: str | None = None):
    """Select the variables of ``adata`` using a ``.var`` column as column indexer.

    Args:
        adata: object to subset along its second axis.
        var_column: name of the ``.var`` column passed as the column indexer.
            When it is falsy, no subsetting takes place.

    Returns:
        ``adata[:, adata.var[var_column]]`` when ``var_column`` is given,
        otherwise the unmodified ``adata`` object itself.
    """
    if not var_column:
        return adata
    column_selector = adata.var[var_column]
    return adata[:, column_selector]

View File

@@ -0,0 +1,75 @@
name: subset_obsp
namespace: "filter"
description: |
Create a subset of an .obsp field in a mudata file, by filtering the columns based on the values of an .obs column. The resulting subset is moved to an .obsm slot.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author, maintainer ]
argument_groups:
- name: Input
arguments:
- name: "--input"
type: file
description: Input h5mu file
direction: input
required: true
example: input.h5mu
- name: "--modality"
type: string
default: "rna"
required: false
- name: "--input_obsp_key"
type: string
required: true
description: The .obsp field to be filtered.
- name: "--input_obs_key"
type: string
required: true
description: The .obs column to filter on.
- name: "--input_obs_value"
type: string
required: true
description: The value to filter on in the .obs column.
- name: Output
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obsm_key"
type: string
required: true
description: The .obsm key to store the subset in.
- name: "--output_compression"
type: string
description: The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false
example: "gzip"
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: /src/base/requirements/anndata_mudata.yaml
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [singlecpu, lowmem]

View File

@@ -0,0 +1,54 @@
import mudata as mu
### VIASH START
par = {
'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu',
'modality': 'rna',
'input_obsp_key': 'distances',
'input_obs_key': 'leiden',
'input_obs_value': '1',
'output_obsm_key': "leiden_1",
'output': 'subset_obsp_output.h5mu',
'output_compression': None,
}
### VIASH END
# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
    """Set the root logger to INFO and attach a stdout handler to it.

    Records are formatted as timestamp, level name padded to eight
    characters, and message.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    import sys

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )
    root_logger.addHandler(stdout_handler)

    return root_logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()
def main():
    """Copy a column subset of an .obsp matrix into .obsm and write the MuData file.

    Reads ``par["input"]``, takes the modality ``par["modality"]`` and keeps the
    columns of ``.obsp[par["input_obsp_key"]]`` for which the .obs column
    ``par["input_obs_key"]``, cast to string, equals ``par["input_obs_value"]``.
    The result is stored under ``.obsm[par["output_obsm_key"]]`` and the whole
    MuData object is written to ``par["output"]``.
    """
    logger.info(f"Reading {par['input']}")
    mudata_object = mu.read_h5mu(par["input"])
    modality = mudata_object.mod[par["modality"]]

    logger.info(f"Subset columns of obsp matrix under {par['input_obsp_key']} based on {par['input_obs_key']} == {par['input_obs_value']}")
    # Rows of .obsp, .obs and .obsm as well as the columns of .obsp all have
    # length `n_obs`. Only the columns are subset here; the rows stay untouched.
    pairwise_matrix = modality.obsp[par["input_obsp_key"]]
    obs_values = modality.obs[par["input_obs_key"]].astype(str)
    column_mask = obs_values == par["input_obs_value"]
    column_subset = pairwise_matrix[:, column_mask]

    logger.info(f"Writing subset obsp matrix to .obsm {par['output_obsm_key']}")
    modality.obsm[par["output_obsm_key"]] = column_subset

    logger.info(f"Writing output to {par['output']}")
    mudata_object.write_h5mu(par["output"], compression=par["output_compression"])
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,48 @@
import sys
import pytest
import mudata as mu
## VIASH START
meta = {
'resources_dir': 'resources_test/pbmc_1k_protein_v3/'
}
## VIASH END
@pytest.fixture
def input_h5mu():
    """MuData test object whose "rna" modality has an .obs column ``filter_column``.

    The first 50 observations are labelled "group_1" and all remaining ones
    "group_2".
    """
    mdata = mu.read_h5mu(f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu")
    rna_obs = mdata.mod["rna"].obs
    # Build the column in a single assignment. The previous chained assignment
    # (obs["filter_column"][:50] = ...) writes to a temporary object and leaves
    # the dataframe unchanged when pandas Copy-on-Write is active.
    rna_obs["filter_column"] = [
        "group_1" if position < 50 else "group_2" for position in range(len(rna_obs))
    ]
    return mdata
@pytest.fixture
def input_path(write_mudata_to_file, input_h5mu):
    """Result of passing the prepared MuData fixture to ``write_mudata_to_file``."""
    written_file = write_mudata_to_file(input_h5mu)
    return written_file
def test_subset_obsp(input_path, run_component, tmp_path):
    """Run the component on the "group_1" observations and inspect the .obsm output."""
    output_path = tmp_path / "output.h5mu"

    # run component
    component_args = [
        "--input", input_path,
        "--output", str(output_path),
        "--input_obsp_key", "distances",
        "--input_obs_key", "filter_column",
        "--input_obs_value", "group_1",
        "--output_obsm_key", "group_1",
    ]
    run_component(component_args)

    assert output_path.is_file(), "Output file not found"

    # check output file
    rna_output = mu.read_h5mu(output_path).mod["rna"]
    assert "group_1" in rna_output.obsm, "Output should contain group_1 in .obsm"
    subset_matrix = rna_output.obsm["group_1"]
    assert subset_matrix.shape[1] == 50, "Obsm should only contain a subset of the original obsp matrix"
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -1,7 +1,7 @@
name: pynndescent_knn
name: knn
namespace: "labels_transfer"
description: |
This component generates a neighborhood graph based using the PyNNDescentTransformer, followed by classification using a k-nearest neighborhood vote.
This component performs label transfer from reference to query using a K-Nearest Neighbors classifier.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer, author ]
@@ -11,6 +11,27 @@ authors:
__merge__: ../api/common_arguments.yaml
argument_groups:
- name: Input dataset (query) arguments
arguments:
- name: "--input_obsm_distances"
type: string
direction: input
required: false
example: bbknn_distances
description: |
The `.obsm` key of the input (query) dataset containing pre-calculated distances.
If not provided, the distances will be calculated using PyNNDescent.
Make sure the distance matrix contains distances relative to the reference dataset and was obtained in the same way as the reference embedding.
- name: Reference dataset arguments
arguments:
- name: "--reference_obsm_distances"
type: string
required: false
description: |
The `.obsm` key of the reference dataset containing pre-calculated distances.
If not provided, the distances will be calculated using PyNNDescent.
example: bbknn_distances
- name: KNN label transfer arguments
arguments:
@@ -30,6 +51,7 @@ argument_groups:
description: |
The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent.
Larger values will result in more accurate search results at the cost of computation time.
resources:
- type: python_script
path: script.py
@@ -56,9 +78,7 @@ engines:
packages:
- pynndescent~=0.5.10
- numpy<2
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml ]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable

View File

@@ -77,20 +77,46 @@ r_adata = r_mdata.mod[par["modality"]]
logger.info("Checking arguments")
par = check_arguments(par)
# Generating training and inference data
logger.info("Generating training and inference data")
train_X = get_reference_features(r_adata, par, logger)
inference_X = get_query_features(q_adata, par, logger)
if par["input_obsm_distances"] and par["reference_obsm_distances"]:
logger.info("Using pre-calculated distances for KNN classification as provided in `--input_obsm_distances` and `--reference_obsm_distances`.")
neighbors_transformer = PyNNDescentTransformer(
n_neighbors=par["n_neighbors"],
parallel_batch_queries=True,
)
neighbors_transformer.fit(train_X)
assert par["input_obsm_distances"] in q_adata.obsm, f"Make sure --input_obsm_distances {par['input_obsm_distances']} is a valid .obsm key. Found: {q_adata.obsm.keys()}."
assert par["reference_obsm_distances"] in r_adata.obsm, f"Make sure --reference_obsm_distances {par['reference_obsm_distances']} is a valid .obsm key. Found: {r_adata.obsm.keys()}."
# Square sparse matrix with distances to n neighbors in reference data
reference_neighbors = neighbors_transformer.transform(inference_X)
query_neighbors = neighbors_transformer.transform(train_X)
query_neighbors = q_adata.obsm[par["input_obsm_distances"]]
reference_neighbors = r_adata.obsm[par["reference_obsm_distances"]]
if query_neighbors.shape[1] != reference_neighbors.shape[1]:
raise ValueError("The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset.")
# Make sure the number of neighbors present in the distance matrix matches the requested number of neighbors in --n_neighbors
# Otherwise reduce n_neighbors for KNN
smallest_neighbor_count = min(
np.diff(query_neighbors.indptr).min(),
np.diff(reference_neighbors.indptr).min()
)
if smallest_neighbor_count < par["n_neighbors"]:
logger.warning(f"The number of neighbors in the distance matrices is smaller than the requested number of neighbors in --n_neighbors. Reducing n_neighbors to {smallest_neighbor_count} for KNN Classification")
par["n_neighbors"] = smallest_neighbor_count
elif par["input_obsm_distances"] or par["reference_obsm_distances"]:
raise ValueError("Make sure to provide both --input_obsm_distances and --reference_obsm_distances if you want to use a pre-calculated distance matrix for KNN classification.")
elif not par["input_obsm_distances"] and not par["reference_obsm_distances"]:
logger.info("No pre-calculated distances were provided. Calculating distances using the PyNNDescent algorithm.")
# Generating training and inference data
train_X = get_reference_features(r_adata, par, logger)
inference_X = get_query_features(q_adata, par, logger)
neighbors_transformer = PyNNDescentTransformer(
n_neighbors=par["n_neighbors"],
parallel_batch_queries=True,
)
neighbors_transformer.fit(train_X)
# Square sparse matrix with distances to n neighbors in reference data
query_neighbors = neighbors_transformer.transform(inference_X)
reference_neighbors = neighbors_transformer.transform(train_X)
# For each target, train a classifier and predict labels
for obs_tar, obs_pred, obs_proba in zip(par["reference_obs_targets"], par["output_obs_predictions"], par["output_obs_probability"]):
@@ -104,10 +130,14 @@ for obs_tar, obs_pred, obs_proba in zip(par["reference_obs_targets"], par["outp
logger.info(f"Using KNN classifier with {par['weights']} weights")
train_y = r_adata.obs[obs_tar].to_numpy()
classifier = KNeighborsClassifier(n_neighbors=par["n_neighbors"], metric="precomputed", weights=weights_dict[par["weights"]])
classifier.fit(X=query_neighbors, y=train_y)
predicted_labels = classifier.predict(reference_neighbors)
probabilities = classifier.predict_proba(reference_neighbors).max(axis=1)
classifier = KNeighborsClassifier(
n_neighbors=par["n_neighbors"],
metric="precomputed",
weights=weights_dict[par["weights"]]
)
classifier.fit(X=reference_neighbors, y=train_y)
predicted_labels = classifier.predict(query_neighbors)
probabilities = classifier.predict_proba(query_neighbors).max(axis=1)
# save_results
logger.info(f"Saving predictions to {obs_pred} and probabilities to {obs_proba} in obs")

View File

@@ -0,0 +1,155 @@
import re
import subprocess
import pytest
from pathlib import Path
import anndata as ad
import mudata as mu
import numpy as np
from scipy.sparse import csr_matrix
# Fallback `meta` used when this file is run directly.
# NOTE(review): presumably viash replaces the VIASH START/END block at test time — confirm.
## VIASH START
meta = {
'resources_dir': './resources_test/'
}
## VIASH END
# Reference dataset as shipped with the test resources (AnnData .h5ad).
reference_h5ad_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad"
# Convert the reference to .h5mu by wrapping it in a MuData with a single "rna" modality.
# This is module-level code, so the .h5mu file is (re)written inside the resources
# directory every time this module is imported.
reference_adata = ad.read_h5ad(reference_h5ad_file)
reference_mdata = mu.MuData({"rna": reference_adata})
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
reference_mdata.write_h5mu(reference_file)
# Query dataset passed to the component via --input.
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
def test_label_transfer(run_component, random_h5mu_path):
    """Run the component without custom output column names.

    Checks that the output file is written and that the "rna" modality's
    .obs contains the `cell_type_pred` and `cell_type_probability` columns
    for the `cell_type` reference target.

    Args:
        run_component: fixture that executes the component with a list of CLI arguments.
        random_h5mu_path: fixture returning a callable that yields an output path.
    """
    output = random_h5mu_path()
    args = [
        "--input", input_file,
        "--modality", "rna",
        "--reference", reference_file,
        "--reference_obs_targets", "cell_type",
        "--output", output,
        "--n_neighbors", "5"
    ]
    run_component(args)

    assert Path(output).is_file(), f"Output file {output} was not created"

    rna_obs = mu.read_h5mu(output).mod["rna"].obs
    assert "cell_type_pred" in rna_obs, \
        f"Predictions cell_type_pred is missing from output\noutput: {rna_obs}"
    # The column holds probabilities, so the failure message should say so.
    assert "cell_type_probability" in rna_obs, \
        f"Probabilities cell_type_probability is missing from output\noutput: {rna_obs}"
@pytest.mark.parametrize("weights", ["uniform", "distance", "gaussian"])
def test_label_transfer_prediction_columns(run_component, weights, random_h5mu_path):
    """Run the component with custom output column names for each weighting scheme.

    Checks that the columns requested through --output_obs_predictions and
    --output_obs_probability are present in the "rna" modality's .obs.

    Args:
        run_component: fixture that executes the component with a list of CLI arguments.
        weights: value passed to --weights (parametrized).
        random_h5mu_path: fixture returning a callable that yields an output path.
    """
    output = random_h5mu_path()
    args = [
        "--input", input_file,
        "--modality", "rna",
        "--reference", reference_file,
        "--reference_obs_targets", "cell_type",
        "--weights", weights,
        "--output", output,
        "--output_obs_probability", "test_probability",
        "--output_obs_predictions", "test_prediction",
        "--n_neighbors", "5"
    ]
    run_component(args)

    assert Path(output).is_file(), f"Output file {output} was not created"

    rna_obs = mu.read_h5mu(output).mod["rna"].obs
    assert "test_prediction" in rna_obs, \
        f"Predictions test_prediction is missing from output\noutput: {rna_obs}"
    # The column holds probabilities, so the failure message should say so.
    assert "test_probability" in rna_obs, \
        f"Probabilities test_probability is missing from output\noutput: {rna_obs}"
def test_label_transfer_prediction_precomputed_neighbor_graph(run_component, random_h5mu_path):
    """Run the component with pre-calculated distance matrices stored in .obsm.

    Mock sparse distance matrices (reference x reference and query x reference)
    are stored under the "distances" key of the "rna" modality and handed to the
    component through --reference_obsm_distances / --input_obsm_distances.

    Args:
        run_component: fixture that executes the component with a list of CLI arguments.
        random_h5mu_path: fixture returning a callable that yields a temporary path.
    """
    output = random_h5mu_path()
    # Write the modified datasets to their own temporary files instead of
    # overwriting the shared test resources, so that other tests (and later
    # runs) keep reading unmodified inputs.
    # NOTE(review): assumes each random_h5mu_path() call yields a distinct path — confirm in conftest.
    reference_with_distances = random_h5mu_path()
    query_with_distances = random_h5mu_path()

    # Seeded generator keeps the mock matrices reproducible between runs.
    rng = np.random.default_rng(0)

    reference_mdata = mu.read_h5mu(reference_file)
    query_mdata = mu.read_h5mu(input_file)
    # Derive the matrix sizes from the data instead of hard-coding cell counts.
    n_reference = reference_mdata.mod["rna"].n_obs
    n_query = query_mdata.mod["rna"].n_obs

    # Add mock distance matrix to obsm slot (values below 0.5 are dropped to make it sparse)
    ref_distances = rng.random((n_reference, n_reference))
    ref_distances[ref_distances < 0.5] = 0
    reference_mdata.mod["rna"].obsm["distances"] = csr_matrix(ref_distances)
    reference_mdata.write_h5mu(reference_with_distances)

    query_distances = rng.random((n_query, n_reference))
    query_distances[query_distances < 0.5] = 0
    query_mdata.mod["rna"].obsm["distances"] = csr_matrix(query_distances)
    query_mdata.write_h5mu(query_with_distances)

    args = [
        "--input", query_with_distances,
        "--modality", "rna",
        "--reference", reference_with_distances,
        "--reference_obs_targets", "cell_type",
        "--output", output,
        "--input_obsm_distances", "distances",
        "--reference_obsm_distances", "distances",
        "--output_obs_probability", "test_probability",
        "--output_obs_predictions", "test_prediction",
        "--n_neighbors", "5"
    ]
    run_component(args)

    assert Path(output).is_file(), f"Output file {output} was not created"

    rna_obs = mu.read_h5mu(output).mod["rna"].obs
    assert "test_prediction" in rna_obs, \
        f"Predictions test_prediction is missing from output\noutput: {rna_obs}"
    assert "test_probability" in rna_obs, \
        f"Probabilities test_probability is missing from output\noutput: {rna_obs}"
def test_raises_distance_matrix_dimensions(run_component, random_h5mu_path):
    """Check that mismatching distance matrix widths make the component fail.

    The reference distance matrix is given fewer columns than the query
    distance matrix; the component is expected to exit with an error whose
    output mentions the corresponding ValueError.

    Args:
        run_component: fixture that executes the component with a list of CLI arguments.
        random_h5mu_path: fixture returning a callable that yields a temporary path.
    """
    output = random_h5mu_path()
    # Write the (deliberately malformed) datasets to their own temporary files
    # instead of overwriting the shared test resources.
    # NOTE(review): assumes each random_h5mu_path() call yields a distinct path — confirm in conftest.
    reference_with_distances = random_h5mu_path()
    query_with_distances = random_h5mu_path()

    # Seeded generator keeps the mock matrices reproducible between runs.
    rng = np.random.default_rng(0)

    reference_mdata = mu.read_h5mu(reference_file)
    query_mdata = mu.read_h5mu(input_file)
    # Derive the matrix sizes from the data instead of hard-coding cell counts.
    n_reference = reference_mdata.mod["rna"].n_obs
    n_query = query_mdata.mod["rna"].n_obs

    # Reference matrix with too few columns (a quarter of the reference cells).
    ref_distances = rng.random((n_reference, n_reference // 4))
    ref_distances[ref_distances < 0.5] = 0
    reference_mdata.mod["rna"].obsm["distances"] = csr_matrix(ref_distances)
    reference_mdata.write_h5mu(reference_with_distances)

    query_distances = rng.random((n_query, n_reference))
    query_distances[query_distances < 0.5] = 0
    query_mdata.mod["rna"].obsm["distances"] = csr_matrix(query_distances)
    query_mdata.write_h5mu(query_with_distances)

    with pytest.raises(subprocess.CalledProcessError) as err:
        run_component([
            "--input", query_with_distances,
            "--modality", "rna",
            "--reference", reference_with_distances,
            "--reference_obs_targets", "cell_type",
            "--output", output,
            "--input_obsm_distances", "distances",
            "--reference_obsm_distances", "distances",
            "--output_obs_probability", "test_probability",
            "--output_obs_predictions", "test_prediction",
            "--n_neighbors", "5"
        ])

    component_output = err.value.stdout.decode('utf-8')
    assert re.search(
        r"ValueError: The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset.",
        component_output
    ), f"Expected ValueError message not found in component output:\n{component_output}"
# When executed as a script, run pytest on this file and use its result as the exit status.
if __name__ == '__main__':
exit(pytest.main([__file__]))

View File

@@ -1,70 +0,0 @@
import pytest
from pathlib import Path
import anndata as ad
import mudata as mu
# Fallback `meta` used when this file is run directly.
# NOTE(review): presumably viash replaces the VIASH START/END block at test time — confirm.
## VIASH START
meta = {
'resources_dir': './resources_test/'
}
## VIASH END
# Reference dataset as shipped with the test resources (AnnData .h5ad).
reference_h5ad_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad"
# Convert the reference to .h5mu by wrapping it in a MuData with a single "rna" modality.
# This is module-level code, so the .h5mu file is (re)written on every import.
reference_adata = ad.read_h5ad(reference_h5ad_file)
reference_mdata = mu.MuData({"rna": reference_adata})
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
reference_mdata.write_h5mu(reference_file)
# Query dataset passed to the component via --input.
input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
def test_label_transfer(run_component):
    """Run the component with default output column names.

    Writes to `output.h5mu` in the working directory and verifies that the
    `cell_type_pred` and `cell_type_probability` columns exist in the "rna"
    modality's .obs.
    """
    output_path = "output.h5mu"
    run_component([
        "--input", input_file,
        "--modality", "rna",
        "--reference", reference_file,
        "--reference_obs_targets", "cell_type",
        "--output", output_path,
        "--n_neighbors", "5",
    ])

    assert Path(output_path).is_file()

    rna_obs = mu.read_h5mu(output_path).mod["rna"].obs
    assert "cell_type_pred" in rna_obs, f"Predictions cell_type_pred is missing from output\noutput: {rna_obs}"
    assert "cell_type_probability" in rna_obs, f"Uncertainties cell_type_probability is missing from output\noutput: {rna_obs}"
@pytest.mark.parametrize("weights", ["uniform", "distance", "gaussian"])
def test_label_transfer_prediction_columns(run_component, weights):
    """Run the component with custom output column names for each weighting scheme.

    The output goes to `output_<weights>.h5mu` in the working directory; the
    test verifies that the `test_prediction` and `test_probability` columns
    exist in the "rna" modality's .obs.
    """
    output = f"output_{weights}.h5mu"
    run_component([
        "--input", input_file,
        "--modality", "rna",
        "--reference", reference_file,
        "--reference_obs_targets", "cell_type",
        "--weights", weights,
        "--output", output,
        "--output_obs_probability", "test_probability",
        "--output_obs_predictions", "test_prediction",
        "--n_neighbors", "5",
    ])

    assert Path(output).is_file()

    rna_obs = mu.read_h5mu(output).mod["rna"].obs
    assert "test_prediction" in rna_obs, f"Predictions test_prediction is missing from output\noutput: {rna_obs}"
    assert "test_probability" in rna_obs, f"Uncertainties test_probability is missing from output\noutput: {rna_obs}"
# When executed as a script, run pytest on this file and use its result as the exit status.
if __name__ == '__main__':
exit(pytest.main([__file__]))

View File

@@ -114,7 +114,7 @@ resources:
- path: make_rhap_reference_2.2.1_nodocker.cwl
test_resources:
- type: bash_script
path: run_test.sh
path: test.sh
- path: /resources_test/reference_gencodev41_chr1/reference.fa.gz
- path: /resources_test/reference_gencodev41_chr1/reference.gtf.gz

View File

@@ -1,6 +1,6 @@
#!/bin/bash
set -eou pipefail
## VIASH START
meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --"

View File

@@ -53,7 +53,7 @@ resources:
path: script.sh
test_resources:
- type: bash_script
path: run_test.sh
path: test.sh
- path: /resources_test/reference_gencodev41_chr1
engines:
- type: docker

View File

@@ -1,6 +1,6 @@
#!/bin/bash
set -eou pipefail
set -eo pipefail
## VIASH START
par_genome_fasta="resources_test/reference_gencodev41_chr1/reference.fa.gz"

View File

@@ -33,7 +33,7 @@ resources:
path: script.sh
test_resources:
- type: bash_script
path: run_test.sh
path: test.sh
- path: /resources_test/reference_gencodev41_chr1
engines:

View File

@@ -1,6 +1,6 @@
#!/bin/bash
set -eou pipefail
set -eo pipefail
## VIASH START
par_genome_fasta="resources_test/reference_gencodev41_chr1/reference.fa.gz"

View File

@@ -1,4 +1,6 @@
# set -eo pipefail
#!/bin/bash
set -eou pipefail
## VIASH START
meta_resources_dir="./resources_test"

View File

@@ -27,8 +27,6 @@ resources:
- type: bash_script
path: script.sh
test_resources:
# - type: bash_script
# path: run_test.sh
- type: python_script
path: test.py
- path: /resources_test/reference_gencodev41_chr1

View File

@@ -1,46 +0,0 @@
#!/bin/bash

# Test for the cellranger_mkgtf component: filter a small GTF on the
# 'gene_type' attribute and check which gene types remain in the output.

set -eou pipefail

## VIASH START
meta_executable="bin/viash run src/reference/cellranger_mkgtf/config.vsh.yaml --"
## VIASH END

# create temporary directory (removed again when the script exits)
tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX")
function clean_up {
  rm -rf "$tmpdir"
}
trap clean_up EXIT

# Keep only records whose 4th column (start position) is below 50001 to get a small input
zcat "$meta_resources_dir/reference_gencodev41_chr1/reference.gtf.gz" | awk '$4 < 50001 {print ;}' | gzip > "$tmpdir/reference_small.gtf.gz"

expected_gene_types=("transcribed_unprocessed_pseudogene" "miRNA")

# Build the comma separated "gene_type:<value>" list for --attribute
attribute_values=$(printf 'gene_type:%s,' "${expected_gene_types[@]}")
attribute_values=${attribute_values%,} # remove trailing comma
echo "$attribute_values"

echo "> Running $meta_name, writing to $tmpdir."
# With 'set -e' a failing command would abort the script before the exit code
# could be reported, so capture it explicitly.
# ---cpus takes its value from meta_cpus (it previously read meta_memory_gb by mistake).
exit_code=0
$meta_executable \
  --input_gtf "$tmpdir/reference_small.gtf.gz" \
  --output_gtf "$tmpdir/myreference_filtered.gtf.gz" \
  --attribute "$attribute_values" \
  ---cpus ${meta_cpus:-1} \
  ---memory ${meta_memory_gb:-2}GB || exit_code=$?

[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1

echo ">> Checking whether output can be found"
[[ ! -f "$tmpdir/myreference_filtered.gtf.gz" ]] && echo "Output gtf file could not be found!" && exit 1

echo ">> Checking attribute 'gene_type' in output gtf file"
# Extract the distinct gene_type values from the attribute column (9th field)
unique_gene_types=$(zcat "$tmpdir/myreference_filtered.gtf.gz" | awk -F'\t' '$9 ~ /gene_type/ { split($9, a, ";"); for(i in a) if(a[i] ~ /gene_type/) print a[i] }' | sed 's/.*gene_type "\(.*\)".*/\1/' | sort -u)
echo "Expected gene types: ${expected_gene_types[*]}"
echo "Unique gene types: $unique_gene_types"

# Only the number of distinct gene types is compared with the number of expected ones
if [[ "${#expected_gene_types[@]}" != "$(echo "$unique_gene_types" | wc -w)" ]]; then
  echo "Error: Not all expected gene types were found in the output gtf file"
  exit 1
fi

echo "> Test succeeded!"

View File

@@ -1,6 +1,6 @@
#!/bin/bash
set -eou pipefail
set -eo pipefail
## VIASH START
par_input_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"

View File

@@ -49,7 +49,7 @@ resources:
path: script.sh
test_resources:
- type: bash_script
path: run_test.sh
path: test.sh
engines:
- type: docker
image: ubuntu:22.04

View File

@@ -1,42 +0,0 @@
#!/bin/bash

# Test for the make_reference component: build a reference from chromosome 1
# plus ERCC spike-ins and check the contents of the resulting fasta.

set -eou pipefail

## VIASH START
meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --"
## VIASH END

echo "> Running $meta_name."

fasta="myreference.fa.gz"
gtf="myreference.gtf.gz"

# Download the input genome, annotation and ERCC archive into the working directory
wget https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
wget https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.chr.gtf.gz
wget https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip

# With 'set -e' a failing command would abort the script before the exit code
# could be reported, so capture it explicitly.
exit_code=0
$meta_executable \
  --genome_fasta "Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" \
  --transcriptome_gtf "Homo_sapiens.GRCh38.109.chr.gtf.gz" \
  --ercc "ERCC92.zip" \
  --subset_regex "(ERCC-00002|1)" \
  --output_fasta "$fasta" \
  --output_gtf "$gtf" || exit_code=$?

[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1

echo ">> Checking whether output can be found"
[[ ! -f "$fasta" ]] && echo "Output fasta file could not be found!" && exit 1
[[ ! -f "$gtf" ]] && echo "Output gtf file could not be found!" && exit 1

echo ">> Checking contents of fasta"
if ! zgrep -q '>1' "$fasta"; then
  echo "Could not find chromosome '1' in output reference!"
  exit 1
fi
if ! zgrep -q '>ERCC-00002' "$fasta"; then
  echo "Could not find ERCC-00002 in output reference!"
  exit 1
fi

echo "> Test succeeded!"

View File

@@ -1,6 +1,6 @@
#!/bin/bash
set -eou pipefail
set -eo pipefail
## VIASH START
par_genome_fasta="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz"

View File

@@ -0,0 +1,80 @@
#!/bin/bash

# Tests for the make_reference component.
#   Test 1: build a reference with ERCC spike-ins and check they end up in the fasta.
#   Test 2: build a reference without --ercc and check no ERCC sequence is present.

set -eo pipefail

## VIASH START
meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --"
## VIASH END

# Fetch test data
echo ">> Fetching test data"
wget https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
wget https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.chr.gtf.gz
wget https://assets.thermofisher.com/TFS-Assets/LSG/manuals/ERCC92.zip

# Test 1
echo ">> Test1"
mkdir test1
pushd test1

fasta="myreference.fa.gz"
gtf="myreference.gtf.gz"

# With 'set -e' a failing command would abort the script before the exit code
# could be reported, so capture it explicitly.
exit_code=0
"$meta_executable" \
  --genome_fasta "../Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" \
  --transcriptome_gtf "../Homo_sapiens.GRCh38.109.chr.gtf.gz" \
  --ercc "../ERCC92.zip" \
  --subset_regex "(ERCC-00002|1)" \
  --output_fasta "$fasta" \
  --output_gtf "$gtf" || exit_code=$?

[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1

echo ">> Checking whether output can be found"
[[ ! -f "$fasta" ]] && echo "Output fasta file could not be found!" && exit 1
[[ ! -f "$gtf" ]] && echo "Output gtf file could not be found!" && exit 1

echo ">> Checking contents of fasta"
if ! zgrep -q '>1' "$fasta"; then
  echo "Could not find chromosome '1' in output reference!"
  exit 1
fi
if ! zgrep -q '>ERCC-00002' "$fasta"; then
  echo "Could not find ERCC-00002 in output reference!"
  exit 1
fi

popd

# Test 2
echo ">> Test 2"
mkdir test2
pushd test2

fasta="myreference.fa.gz"
gtf="myreference.gtf.gz"

# Same as test 1 but without --ercc and --subset_regex
exit_code=0
"$meta_executable" \
  --genome_fasta "../Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz" \
  --transcriptome_gtf "../Homo_sapiens.GRCh38.109.chr.gtf.gz" \
  --output_fasta "$fasta" \
  --output_gtf "$gtf" || exit_code=$?

[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1

echo ">> Checking whether output can be found"
[[ ! -f "$fasta" ]] && echo "Output fasta file could not be found!" && exit 1
[[ ! -f "$gtf" ]] && echo "Output gtf file could not be found!" && exit 1

echo ">> Checking contents of fasta"
if ! zgrep -q '>1' "$fasta"; then
  echo "Could not find chromosome '1' in output reference!"
  exit 1
fi
# No ERCC input was given, so no ERCC sequence may show up in the output
if zgrep -q '>ERCC-00002' "$fasta"; then
  echo "Should not find ERCC-00002 in output reference!"
  exit 1
fi

popd

echo "> Test succeeded!"

View File

@@ -1,22 +1,16 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
viash ns build -q gdo_singlesample
nextflow run . \
nextflow \
run . \
-main-script src/workflows/gdo/gdo_singlesample/test.nf \
-profile docker,no_publish \
-entry test_wf \
-with-trace work/trace.txt \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config

View File

@@ -43,8 +43,7 @@ workflow run_wf {
]
return newState
},
toState: ["output": "output"],
auto: [ publish: true ]
toState: ["output": "output"]
)
| setState(["output"])
emit:

View File

@@ -5,13 +5,13 @@ include { gdo_singlesample } from params.rootDir + "/target/nextflow/workflows/g
params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("10x_5k_lung_crispr/SC3_v3_NextGem_DI_CRISPR_A549_5K.h5mu"),
input: resources_test.resolve("10x_5k_lung_crispr/SC3_v3_NextGem_DI_CRISPR_A549_5K.h5mu"),
min_counts: 3,
max_counts: 10000000,
min_guides_per_cell: 2,

View File

@@ -1,22 +1,16 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
nextflow \
run . \
-main-script src/workflows/ingestion/bd_rhapsody/test.nf \
-entry test_wf \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-with-trace work/trace.txt
-c src/workflows/utils/integration_tests.config

View File

@@ -6,16 +6,16 @@ include { bd_rhapsody_test } from params.rootDir + "/target/nextflow/test_workfl
params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList(
[
[
id: "foo",
reads: file("${params.resources_test}/bdrhap_5kjrt/raw/12*.fastq.gz"),
reference_archive: file(params.resources_test).resolve("reference_gencodev41_chr1/reference_bd_rhapsody.tar.gz"),
abseq_reference: file(params.resources_test).resolve("bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta"),
reference_archive: resources_test.resolve("reference_gencodev41_chr1/reference_bd_rhapsody.tar.gz"),
abseq_reference: resources_test.resolve("bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta"),
cell_calling_data: "mRNA",
exact_cell_count: 4900
]

View File

@@ -1,23 +1,15 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
viash ns build -q ingestion/cellranger_mapping --setup cb --platform nextflow
export NXF_VER=21.10.6
nextflow \
run . \
-main-script src/workflows/ingestion/cellranger_mapping/test.nf \
-entry test_wf \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-with-trace work/trace.txt
-c src/workflows/utils/integration_tests.config

View File

@@ -18,8 +18,7 @@ workflow run_wf {
toState: [
"input": "output",
"output_raw": "output"
],
auto: [ publish: true ]
]
)
// split output dir into map
| cellranger_count_split.run(
@@ -49,14 +48,9 @@ workflow run_wf {
"input_metrics_summary": state.metrics_summary
]
},
toState: { id, output, state ->
[
"output_raw": state.output_raw,
"output_h5mu": output.output
]
},
auto: [ publish: true ],
toState: ["output_h5mu": "output"]
)
| setState(["output_raw", "output_h5mu"])
emit:
output_ch

View File

@@ -7,11 +7,13 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "foo",
input: file(params.resources_test).resolve("cellranger_tiny_fastq/cellranger_tiny_fastq"),
reference: file(params.resources_test).resolve("cellranger_tiny_fastq/cellranger_tiny_ref"),
input: resources_test.resolve("cellranger_tiny_fastq/cellranger_tiny_fastq"),
reference: resources_test.resolve("cellranger_tiny_fastq/cellranger_tiny_ref"),
output_type: "filtered",
]
])

View File

@@ -1,32 +1,23 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=22.10.3
nextflow \
run . \
-main-script src/workflows/ingestion/cellranger_multi/test.nf \
-entry test_wf \
-resume \
-profile no_publish,docker \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-with-trace work/trace.txt
-c src/workflows/utils/integration_tests.config
nextflow \
run . \
-main-script src/workflows/ingestion/cellranger_multi/test.nf \
-entry test_wf2 \
-resume \
-profile no_publish,docker \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-with-trace work/trace.txt
-c src/workflows/utils/integration_tests.config

View File

@@ -7,20 +7,32 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "foo",
input:[file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz"),
file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz"),
file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz"),
file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz"),
file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz"),
file(params.resources_test).resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz")],
gex_reference: file(params.resources_test).resolve("reference_gencodev41_chr1/reference_cellranger.tar.gz"),
vdj_reference: file(params.resources_test).resolve("10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"),
feature_reference: file(params.resources_test).resolve("10x_5k_anticmv/raw/feature_reference.csv"),
library_id: ["5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", "5k_human_antiCMV_T_TBNK_connect_AB_subset", "5k_human_antiCMV_T_TBNK_connect_VDJ_subset"],
library_type: ["Gene Expression", "Antibody Capture", "VDJ"]
input:[
resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz"),
resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz"),
resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz"),
resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz"),
resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz"),
resources_test.resolve("10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz")
],
gex_reference: resources_test.resolve("reference_gencodev41_chr1/reference_cellranger.tar.gz"),
vdj_reference: resources_test.resolve("10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz"),
feature_reference: resources_test.resolve("10x_5k_anticmv/raw/feature_reference.csv"),
library_id: [
"5k_human_antiCMV_T_TBNK_connect_GEX_1_subset",
"5k_human_antiCMV_T_TBNK_connect_AB_subset",
"5k_human_antiCMV_T_TBNK_connect_VDJ_subset"
],
library_type: [
"Gene Expression",
"Antibody Capture",
"VDJ"
]
]
])
| map{ state -> [state.id, state] }
@@ -44,8 +56,9 @@ workflow test_wf {
}
workflow test_wf2 {
// Test cell multiplexing
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "foo",

View File

@@ -1,7 +1,5 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
@@ -14,9 +12,7 @@ nextflow \
-entry test_wf \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-with-trace work/trace.txt \
-resume
-c src/workflows/utils/integration_tests.config
nextflow \
run . \
@@ -24,6 +20,4 @@ nextflow \
-entry test_wf2 \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-with-trace work/trace.txt \
-resume
-c src/workflows/utils/integration_tests.config

View File

@@ -4,9 +4,10 @@ workflow run_wf {
main:
// perform correction if so desired
mid1_corrected = input_ch
| filter{ it[1].perform_correction }
output_ch = input_ch
| cellbender_remove_background.run(
runIf: {id, state -> state.perform_correction},
fromState: { id, state ->
[
input: state.input,
@@ -16,17 +17,13 @@ workflow run_wf {
]
},
toState: { id, output, state ->
state + [input: output.output, layer: "cellbender_corrected"]
state + ["input": output.output, "layer": "cellbender_corrected"]
}
)
mid1_uncorrected = input_ch
| filter{ ! it[1].perform_correction }
mid1 = mid1_corrected.mix(mid1_uncorrected)
// perform filtering if so desired
mid2_filtered = mid1
| filter{ it[1].min_genes != null || it[1].min_counts != null }
| filter_with_counts.run(
runIf: {id, state ->
state.min_genes != null || state.min_counts != null
},
fromState: { id, state ->
[
input: state.input,
@@ -39,16 +36,14 @@ workflow run_wf {
},
toState: [input: "output"]
)
mid2_unfiltered = mid1
| filter{ it[1].min_genes == null && it[1].min_counts == null }
mid2 = mid2_filtered.mix(mid2_unfiltered)
// return output map
output_ch = mid2
// Make sure to use the correct output file names,
// irrespective of whether or not any of the above
// components were run
| publish.run(
fromState: [ input: "input", output: "output" ],
auto: [ publish: true ]
toState: ["output": "output"]
)
| setState(["output"])
emit:
output_ch

View File

@@ -8,11 +8,13 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "foo",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
input_og: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
input_og: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
perform_correction: true,
min_genes: 100,
min_counts: 1000,
@@ -55,9 +57,8 @@ workflow test_wf {
}
workflow test_wf2 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[

View File

@@ -5,13 +5,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=24.04.4
nextflow \
run . \
-main-script src/workflows/ingestion/conversion/test.nf \
-entry test_wf \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-with-trace work/trace.txt
-c src/workflows/utils/integration_tests.config

View File

@@ -25,11 +25,9 @@ workflow run_wf {
}
passed_state
},
toState: {id, output, state, comp ->
["output": output.output]
},
auto: [publish: true],
toState: ["output": "output"]
)
| setState(["output": "output"])
emit:
output_ch

View File

@@ -7,30 +7,32 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "10xh5_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5"),
input_type: "10xh5",
modality: null
],
[
id: "10xmtx_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"),
input_type: "10xmtx",
modality: null,
output: "\$id.h5mu"
],
[
id: "10xmtx",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix"),
input_type: "10xmtx",
modality: "rna",
output: "\$key.h5mu"
],
[
id: "h5ad",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5ad"),
input_type: "h5ad",
modality: "rna",
output: "\$key.h5mu"

View File

@@ -1,23 +1,15 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
viash ns build -q 'workflows/ingestion/demux'
nextflow \
run . \
-main-script src/workflows/ingestion/demux/test.nf \
-entry test_wf \
-resume \
-profile docker,no_publish \
-with-trace work/trace.txt \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
-c src/workflows/utils/integration_tests.config

View File

@@ -7,24 +7,26 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
// or when running from s3:
Channel.fromList([
[
id: "mkfastq_test",
input: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl"),
sample_sheet: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"),
input: resources_test.resolve("cellranger_tiny_bcl/bcl"),
sample_sheet: resources_test.resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"),
demultiplexer: "mkfastq"
],
[
id: "bclconvert_test",
input: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl2/"),
sample_sheet: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl2/sample_sheet.csv"),
input: resources_test.resolve("cellranger_tiny_bcl/bcl2/"),
sample_sheet: resources_test.resolve("cellranger_tiny_bcl/bcl2/sample_sheet.csv"),
demultiplexer: "bclconvert"
],
[
id: "bcl2fastq_test",
input: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl"),
sample_sheet: file(params.resources_test).resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"),
input: resources_test.resolve("cellranger_tiny_bcl/bcl"),
sample_sheet: resources_test.resolve("cellranger_tiny_bcl/bcl/sample_sheet.csv"),
demultiplexer: "bcl2fastq",
ignore_missing: true
]

View File

@@ -1,20 +1,15 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=23.04.2
nextflow \
run . \
-main-script src/workflows/ingestion/make_reference/test.nf \
-entry test_wf \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-resume
-c src/workflows/utils/integration_tests.config

View File

@@ -6,12 +6,14 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "gencode_v41_ercc",
genome_fasta: file(params.resources_test).resolve("reference_gencodev41_chr1/reference.fa.gz"),
transcriptome_gtf: file(params.resources_test).resolve("reference_gencodev41_chr1/reference.gtf.gz"),
ercc: file(params.resources_test).resolve("reference_gencodev41_chr1/ERCC92.zip"),
genome_fasta: resources_test.resolve("reference_gencodev41_chr1/reference.fa.gz"),
transcriptome_gtf: resources_test.resolve("reference_gencodev41_chr1/reference.gtf.gz"),
ercc: resources_test.resolve("reference_gencodev41_chr1/ERCC92.zip"),
subset_regex: "(ERCC-00002|chr1)",
target: ["cellranger", "bd_rhapsody", "star"]
]

View File

@@ -1,25 +1,23 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
nextflow run . \
nextflow \
run . \
-main-script src/workflows/integration/bbknn_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
nextflow run . \
nextflow \
run . \
-main-script src/workflows/integration/bbknn_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf2 \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
-c src/workflows/utils/integration_tests.config

View File

@@ -86,11 +86,9 @@ workflow run_wf {
"output_compression": "gzip"
]
},
toState: { id, output, state ->
[ output: output.output ]
},
auto: [publish: true]
toState: ["output": "output"]
)
| setState(["output"])
emit:
output_ch

View File

@@ -5,16 +5,19 @@ include { bbknn_leiden } from params.rootDir + "/target/nextflow/workflows/integ
params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch =
Channel.fromList([
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
layer: "log_normalized"
],
[
id: "no_leiden_resolutions_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
layer: "log_normalized",
leiden_resolution: []
]
@@ -45,7 +48,9 @@ workflow test_wf {
}
workflow test_wf2 {
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch =
Channel.fromList([
[

View File

@@ -1,27 +1,23 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
nextflow run . \
nextflow \
run . \
-main-script src/workflows/integration/harmony_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
nextflow run . \
nextflow \
run . \
-main-script src/workflows/integration/harmony_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf2 \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config

View File

@@ -77,11 +77,9 @@ workflow run_wf {
"output_compression": "gzip"
]
},
toState: { id, output, state ->
[ output: output.output ]
},
auto: [ publish: true ]
toState: ["output": "output"]
)
| setState(["output"])
emit:
output_ch

View File

@@ -6,11 +6,13 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch =
Channel.fromList([
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
layer: "log_normalized",
obs_covariates: "sample_id",
embedding: "X_pca",
@@ -19,7 +21,7 @@ workflow test_wf {
],
[
id: "no_leiden_resolutions_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
layer: "log_normalized",
obs_covariates: "sample_id",
embedding: "X_pca",
@@ -53,7 +55,8 @@ workflow test_wf {
workflow test_wf2 {
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch =
Channel.fromList([

View File

@@ -6,20 +6,18 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
nextflow run . \
nextflow \
run . \
-main-script src/workflows/integration/scanorama_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf \
-resume \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
nextflow run . \
-main-script src/workflows/integration/scanorama_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf2 \
-resume \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-c src/workflows/utils/integration_tests.config
nextflow \
run . \
-main-script src/workflows/integration/scanorama_leiden/test.nf \
-entry test_wf2 \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config

View File

@@ -74,11 +74,9 @@ workflow run_wf {
"output_compression": "gzip"
]
},
auto: [ publish: true ],
toState: { id, output, state ->
[ output: output.output ]
}
toState: ["output": "output"]
)
| setState(["output"])
emit:
output_ch

View File

@@ -6,16 +6,18 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
layer: "log_normalized",
leiden_resolution: [1.0, 0.25],
],
[
id: "no_leiden_resolutions_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
layer: "log_normalized",
leiden_resolution: [],
]
@@ -46,8 +48,8 @@ workflow test_wf {
}
workflow test_wf2 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[

View File

@@ -1,29 +1,23 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
viash ns build -q scgpt_leiden
nextflow run . \
nextflow \
run . \
-main-script src/workflows/integration/scgpt_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
nextflow run . \
nextflow \
run . \
-main-script src/workflows/integration/scgpt_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf2 \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-c src/workflows/utils/integration_tests.config

View File

@@ -152,11 +152,9 @@ workflow run_wf {
"output": state.workflow_output
]
},
toState: { id, output, state ->
[ output: output.output ]
},
auto: [ publish: true ]
toState: ["output": "output"]
)
| setState(["output"])
emit:
output_ch

View File

@@ -6,30 +6,32 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"),
model: file(params.resources_test).resolve("scgpt/source/best_model.pt"),
model_config: file(params.resources_test).resolve("scgpt/source/args.json"),
model_vocab: file(params.resources_test).resolve("scgpt/source/vocab.json"),
input_layer: "log_normalized",
obs_batch_label: "sample",
n_hvg: 400,
seed: 1,
leiden_resolution: [1.0, 0.25]
id: "simple_execution_test",
input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"),
model: resources_test.resolve("scgpt/source/best_model.pt"),
model_config: resources_test.resolve("scgpt/source/args.json"),
model_vocab: resources_test.resolve("scgpt/source/vocab.json"),
input_layer: "log_normalized",
obs_batch_label: "sample",
n_hvg: 400,
seed: 1,
leiden_resolution: [1.0, 0.25]
],
[
id: "no_leiden_resolutions_test",
input: file(params.resources_test).resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"),
model: file(params.resources_test).resolve("scgpt/source/best_model.pt"),
model_config: file(params.resources_test).resolve("scgpt/source/args.json"),
model_vocab: file(params.resources_test).resolve("scgpt/source/vocab.json"),
obs_batch_label: "sample",
n_hvg: 400,
seed: 1,
input_layer: "log_normalized",
leiden_resolution: []
id: "no_leiden_resolutions_test",
input: resources_test.resolve("scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu"),
model: resources_test.resolve("scgpt/source/best_model.pt"),
model_config: resources_test.resolve("scgpt/source/args.json"),
model_vocab: resources_test.resolve("scgpt/source/vocab.json"),
obs_batch_label: "sample",
n_hvg: 400,
seed: 1,
input_layer: "log_normalized",
leiden_resolution: []
]
])
| map{ state -> [state.id, state] }
@@ -59,7 +61,8 @@ workflow test_wf {
workflow test_wf2 {
resources_test = file("${params.rootDir}/resources_test/scgpt")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[

View File

@@ -1,19 +1,15 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
nextflow run . \
nextflow \
run . \
-main-script src/workflows/integration/scvi_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-resume
-c src/workflows/utils/integration_tests.config

View File

@@ -98,14 +98,9 @@ workflow run_wf {
"output_compression": "gzip"
]
},
auto: [ publish: true ],
toState: { id, output, state ->
[
output: output.output,
output_model: state.output_model
]
}
toState: ["output": "output"]
)
| setState(["output", "output_model"])
emit:
output_ch

View File

@@ -6,10 +6,12 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
layer: "log_normalized",
obs_batch: "sample_id",
max_epochs: 1,
@@ -17,7 +19,7 @@ workflow test_wf {
],
[
id: "no_leiden_resolutions_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
layer: "log_normalized",
obs_batch: "sample_id",
output_model: "no_leiden_resolutions_test_model/",

View File

@@ -1,18 +1,15 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=23.04.2
nextflow run . \
nextflow \
run . \
-main-script src/workflows/integration/totalvi_leiden/test.nf \
-profile docker,no_publish \
-entry test_wf \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config

View File

@@ -139,15 +139,9 @@ workflow run_wf {
"compression": "gzip"
]
},
toState: { id, output, state ->
[
output: output.output,
reference_model_path: state.reference_model_path,
query_model_path: state.query_model_path
]
},
auto: [ publish: true ]
// Explicit map form, consistent with the other workflows in this change set
// (they all pass `toState: ["output": "output"]`); the previous list literal
// repeated "output" twice, which reads like a comma-for-colon typo.
toState: ["output": "output"]
)
| setState(["output", "reference_model_path", "query_model_path"])
emit:
output_ch
}

View File

@@ -6,11 +6,13 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
reference: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
reference: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
prot_modality: "prot",
prot_reference_modality: "prot",
var_input: "filter_with_hvg",
@@ -21,8 +23,8 @@ workflow test_wf {
],
[
id: "no_prot_leiden_resolutions_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
reference: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
reference: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
prot_modality: "prot",
prot_reference_modality: "prot",
var_input: "filter_with_hvg",
@@ -34,8 +36,8 @@ workflow test_wf {
],
[
id: "no_rna_leiden_resolutions_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
reference: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
reference: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
prot_modality: "prot",
prot_reference_modality: "prot",
var_input: "filter_with_hvg",

View File

@@ -1,19 +1,15 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
nextflow run . \
nextflow \
run . \
-main-script src/workflows/multiomics/dimensionality_reduction/test.nf \
-profile docker,no_publish \
-entry test_wf \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-resume
-c src/workflows/utils/integration_tests.config

View File

@@ -44,11 +44,9 @@ workflow run_wf {
"output_compression": "gzip"
]
},
toState: { id, output, state ->
[ output: output.output ]
},
auto: [ publish: true ]
toState: ["output": "output"]
)
| setState(["output"])
emit:
output_ch

View File

@@ -6,18 +6,19 @@ include { dimensionality_reduction_test } from params.rootDir + "/target/nextflo
params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
// allow changing the resources_test dir
resources_test = file(params.resources_test)
input_ch = Channel.fromList([
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
layer: "",
output: "foo.final.h5mu"
],
[
id: "pca_obsm_output_test",
input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
layer: "",
output: "foo.final.h5mu"
],

View File

@@ -1,26 +1,18 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
viash ns build -q process_batches
export NXF_VER=24.04.4
nextflow \
run . \
-main-script src/workflows/multiomics/process_batches/test.nf \
-entry test_wf \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-resume
-c src/workflows/utils/integration_tests.config
nextflow \
run . \
@@ -28,5 +20,4 @@ nextflow \
-entry test_wf2 \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-resume
-c src/workflows/utils/integration_tests.config

View File

@@ -215,7 +215,7 @@ workflow run_wf {
"output": state.workflow_output,
]
},
auto: [publish: true]
toState: ["output": "output"]
)
| setState(["output"])

View File

@@ -9,16 +9,18 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
input_ch = Channel.fromList([
[
id: "test",
input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
publish_dir: "foo/",
clr_axis: 0
],
[
id: "test2",
input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
publish_dir: "foo/",
clr_axis: 1
]
@@ -52,8 +54,8 @@ workflow test_wf {
}
workflow test_wf2 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
input_ch = Channel.fromList([
[

View File

@@ -8,15 +8,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=23.10.3
viash ns build -q '^workflows'
nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
@@ -26,7 +21,6 @@ nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
@@ -37,7 +31,6 @@ nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
@@ -48,7 +41,6 @@ nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
@@ -57,9 +49,8 @@ nextflow \
nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf2 \
-resume \
-profile docker,no_publish \
-entry test_wf2 \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
@@ -67,7 +58,6 @@ nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf3 \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
@@ -76,7 +66,6 @@ nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf4 \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
@@ -85,7 +74,6 @@ nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf5 \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
@@ -94,7 +82,6 @@ nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf6 \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
@@ -103,7 +90,6 @@ nextflow \
run . \
-main-script src/workflows/multiomics/process_samples/test.nf \
-entry test_wf7 \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
-c src/workflows/utils/integration_tests.config

View File

@@ -9,17 +9,19 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "mouse",
input: file(params.resources_test).resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
input: resources_test.resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
publish_dir: "foo/",
rna_min_counts: 2,
output: "test.h5mu",
],
[
id: "human",
input: file(params.resources_test).resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
input: resources_test.resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
publish_dir: "foo/",
rna_min_counts: 2,
output: "test.h5mu",
@@ -42,8 +44,8 @@ workflow test_wf {
}
workflow test_wf2 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
@@ -94,8 +96,8 @@ workflow test_wf2 {
}
workflow test_wf3 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
input_ch = Channel.fromList([
[
@@ -156,8 +158,8 @@ workflow test_wf3 {
}
workflow test_wf4 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
@@ -187,8 +189,8 @@ workflow test_wf4 {
}
workflow test_wf5 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
@@ -235,8 +237,8 @@ workflow test_wf5 {
}
workflow test_wf6 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
@@ -298,8 +300,8 @@ workflow test_wf6 {
// }
workflow test_wf7 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[

View File

@@ -4,12 +4,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
nextflow run . \
-main-script src/workflows/multiomics/split_modalities/test.nf \
-entry test_wf \
-resume \
-profile docker \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
nextflow \
run . \
-main-script src/workflows/multiomics/split_modalities/test.nf \
-entry test_wf \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config

View File

@@ -7,10 +7,12 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "mouse",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
publish_dir: "foo/",
output: "modalities",
output_types: "types.csv"

View File

@@ -6,12 +6,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
viash ns build -q prot_multisample
nextflow run . \
nextflow \
run . \
-main-script src/workflows/prot/prot_multisample/test.nf \
-profile docker,no_publish \
-entry test_wf \
-with-trace work/trace.txt \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
-c src/workflows/utils/integration_tests.config

View File

@@ -6,17 +6,19 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "adt_samples_axis_0",
sample_id: "pbmc",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
clr_axis: 0
],
[
id: "adt_samples_axis_1",
sample_id: "pbmc",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
clr_axis: 1
]
])

View File

@@ -6,15 +6,10 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
viash ns build -q prot_singlesample
nextflow run . \
nextflow \
run . \
-main-script src/workflows/multiomics/prot_singlesample/test.nf \
-profile docker,no_publish \
-resume \
-entry test_wf \
-with-trace work/trace.txt \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
-c src/workflows/utils/integration_tests.config

View File

@@ -44,7 +44,6 @@ workflow run_wf {
return newState
},
toState: ["output": "output"],
auto: [ publish: true ]
)
| setState(["output"])

View File

@@ -6,10 +6,12 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "foo",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
min_counts: 3,
max_counts: 100000,
min_genes_per_cell: 2,

View File

@@ -1,22 +1,15 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
viash ns build -q '^workflows/qc/qc'
nextflow \
run . \
-main-script src/workflows/qc/qc/test.nf \
-entry test_wf \
-resume \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config

View File

@@ -81,7 +81,7 @@ workflow run_wf {
"compression": "gzip"
]
},
auto: [ publish: true ]
toState: ["output": "output"]
)
| setState(["output"])

View File

@@ -7,15 +7,17 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch =
Channel.fromList([
[
id: "mouse_test",
input: file(params.resources_test).resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
input: resources_test.resolve("concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
],
[
id: "human_test",
input: file(params.resources_test).resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
input: resources_test.resolve("concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"),
]
])
| map { state -> [state.id, state] }

View File

@@ -1,22 +1,15 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
viash ns build -q rna_multisample
export NXF_VER=21.10.6
nextflow run . \
nextflow \
run . \
-main-script src/workflows/rna/rna_multisample/test.nf \
-profile docker,no_publish \
-resume \
-entry test_wf \
-with-trace work/trace.txt \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config

View File

@@ -6,10 +6,12 @@ params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
input: resources_test.resolve("concat_test_data/concatenated_brain_filtered_feature_bc_matrix_subset.h5mu"),
output: "concatenated_file.final.h5mu"
]
])

View File

@@ -1,30 +1,23 @@
#!/bin/bash
# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"
export NXF_VER=21.10.6
viash ns build -q rna_singlesample
# viash ns build -q 'filter|publish|qc|metadata' --parallel --setup cb
nextflow run . \
nextflow \
run . \
-main-script src/workflows/rna/rna_singlesample/test.nf \
-profile docker,no_publish \
-entry test_wf \
-with-trace work/trace.txt \
-profile docker,no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config
nextflow run . \
nextflow \
run . \
-entry test_wf2 \
-main-script src/workflows/rna/rna_singlesample/test.nf \
-profile docker,no_publish \
-entry test_wf2 \
-with-trace work/trace.txt \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config

View File

@@ -133,8 +133,8 @@ workflow run_wf {
"layer": "layer",
],
args: [output_compression: "gzip"],
auto: [ publish: true ]
)
| setState(["output": "output"])
emit:
output_ch

View File

@@ -5,13 +5,13 @@ include { rna_singlesample } from params.rootDir + "/target/nextflow/workflows/r
params.resources_test = params.rootDir + "/resources_test"
workflow test_wf {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[
id: "mitochondrial_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
min_counts: 3,
max_counts: 10000000,
min_genes_per_cell: 2,
@@ -27,7 +27,7 @@ workflow test_wf {
],
[
id: "simple_execution_test",
input: file(params.resources_test).resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
min_counts: 3,
max_counts: 10000000,
min_genes_per_cell: 2,
@@ -54,8 +54,8 @@ workflow test_wf {
}
workflow test_wf2 {
// allow changing the resources_test dir
resources_test = file("${params.rootDir}/resources_test")
resources_test = file(params.resources_test)
output_ch = Channel.fromList([
[

View File

@@ -39,4 +39,6 @@ engines:
__merge__: /src/base/requirements/viashpy.yaml
runners:
- type: executable
- type: nextflow
- type: nextflow
directives:
label: [midmem, midcpu]

View File

@@ -0,0 +1,427 @@
name: "celltypist"
namespace: "annotate"
version: "fix-integration-tests"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
description: "Input dataset (query) arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "The input (query) data to be labeled. Should be a .h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "The layer in the input data to be used for cell type annotation\
\ if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_query_gene_names"
description: "The name of the adata var column in the input data containing gene\
\ names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "The reference data to train the CellTypist classifiers on. Only\
\ required if a pre-trained --model is not provided."
info: null
example:
- "reference.h5mu"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "The layer in the reference data to be used for cell type annotation\
\ if .X is not to be used. Data are expected to be processed in the same way\
\ as the --input query dataset."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_target"
description: "The name of the adata obs column in the reference data containing\
\ cell type annotations."
info: null
default:
- "cell_ontology_class"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--check_expression"
description: "Whether to check the expression of the reference dataset against the\
\ format recommended by CellTypist.\nCellTypist requires data to be log-normalized\
\ to 10000 counts per cell.\n"
info: null
direction: "input"
- type: "string"
name: "--var_reference_gene_names"
description: "The name of the adata var column in the reference data containing\
\ gene names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Model arguments"
description: "Model arguments."
arguments:
- type: "file"
name: "--model"
description: "Pretrained model in pkl format. If not provided, the model will\
\ be trained on the reference data and --reference should be provided."
info: null
example:
- "pretrained_model.pkl"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--feature_selection"
description: "Whether to perform feature selection."
info: null
default:
- false
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--majority_voting"
description: "Whether to refine the predicted labels by running the majority voting\
\ classifier after over-clustering."
info: null
default:
- false
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--C"
description: "Inverse of regularization strength in logistic regression."
info: null
default:
- 1.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_iter"
description: "Maximum number of iterations before reaching the minimum of the\
\ cost function."
info: null
default:
- 1000
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--use_SGD"
description: "Whether to use the stochastic gradient descent algorithm."
info: null
direction: "input"
- type: "double"
name: "--min_prop"
description: "\"For the dominant cell type within a subcluster, the minimum proportion\
\ of cells required to \nsupport naming of the subcluster by this cell type.\
\ Ignored if majority_voting is set to False. \nSubcluster that fails to pass\
\ this proportion threshold will be assigned 'Heterogeneous'.\"\n"
info: null
default:
- 0.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_predictions"
description: "In which `.obs` slots to store the predicted information.\n"
info: null
default:
- "celltypist_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_probability"
description: "In which `.obs` slots to store the probability of the predictions.\n"
info: null
default:
- "celltypist_probability"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\
\ of logistic regression classifiers optimised by the stochastic gradient descent\
\ algorithm."
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "annotation_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
- type: "file"
path: "openpipelinetestutils"
dest: "openpipelinetestutils"
info: null
status: "enabled"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.10-slim"
target_registry: "images.viash-hub.com"
target_tag: "fix-integration-tests"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "libhdf5-dev"
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "scanpy~=1.9.6"
upgrade: true
- type: "python"
user: false
packages:
- "celltypist==1.6.3"
upgrade: true
- type: "python"
user: false
packages:
- "anndata==0.10.8"
- "mudata~=0.2.4"
- "pandas!=2.1.2"
- "numpy<2.0.0"
upgrade: true
test_setup:
- type: "docker"
copy:
- "openpipelinetestutils /opt/openpipelinetestutils"
- type: "python"
user: false
packages:
- "/opt/openpipelinetestutils"
upgrade: true
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/celltypist/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/celltypist"
executable: "target/executable/annotate/celltypist/celltypist"
viash_version: "0.9.0"
git_commit: "da62b4ffe30b6ef36fcb7ef5944f29d45d1138ff"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-1939-gda62b4ff"
package_config:
name: "openpipeline"
version: "fix-integration-tests"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.0"
source: "src"
target: "target"
config_mods:
- ".test_resources += {path: '/src/base/openpipelinetestutils', dest: 'openpipelinetestutils'}\n\
.resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].directives.tag := '$id'\n.runners[.type == 'nextflow'].config.script\
\ := 'includeConfig(\"nextflow_labels.config\")'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'fix-integration-tests'"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,42 @@
process {
// Process-scope defaults plus the resource labels that components can opt into.
// Default resources for components that hardly do any processing.
// The default memory request is multiplied by the attempt number.
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140); every other failure uses 'terminate'.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional memory ceiling read by get_memory(). While it is null (or otherwise falsy),
// get_memory() returns the requested amount unchanged.
maxMemory = null
// Resource labels
// CPU labels set a fixed number of cpus.
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory labels request a base amount multiplied by the attempt number,
// passed through get_memory() so that process.maxMemory can be applied.
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
}
/**
 * Applies the optional `process.maxMemory` ceiling to a requested memory amount.
 *
 * @param to_compare the requested amount (the labels above pass e.g. `4.GB * task.attempt`).
 * @return `to_compare` when no ceiling is configured or the request does not exceed it;
 *         `process.maxMemory` when the current attempt number equals `process.maxRetries`
 *         or when the request exceeds the ceiling.
 *
 * If any of the comparisons or conversions throws, an explanatory message is printed
 * and the JVM exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured (missing, null or otherwise falsy): return the request unchanged.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; confirm it resolves
    // in the context this helper is called from.
    // When the attempt number equals maxRetries, ask for the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
      // The request exceeds the ceiling: clamp to it. This branch used to reference the
      // undefined name `max_memory`, which always failed and fell into the catch below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    The root logger's level is set to INFO and a stream handler bound to
    ``sys.stdout`` is attached, formatted as
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The handler is tagged with a fixed name and attached only once, so calling
    this function repeatedly does not make every record appear multiple times.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    handler_name = "setup_logger_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Skip attaching when an earlier call already installed the console handler.
    already_attached = any(
        existing.get_name() == handler_name for existing in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.set_name(handler_name)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger

View File

@@ -0,0 +1,367 @@
name: "onclass"
namespace: "annotate"
version: "fix-integration-tests"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
argument_groups:
- name: "Inputs"
description: "Input dataset (query) arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "The input (query) data to be labeled. Should be a .h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "The layer in the input data to be used for cell type annotation\
\ if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--cl_nlp_emb_file"
description: "The .nlp.emb file with the cell type embeddings."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--cl_ontology_file"
description: "The .ontology file with the cell type ontology."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--cl_obo_file"
description: "The .obo file with the cell type ontology."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_query_gene_names"
description: "The name of the adata var column in the input data containing gene\
\ names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "The reference data to train the OnClass classifier on. Only\
\ required if a pre-trained --model is not provided."
info: null
example:
- "reference.h5mu"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "The layer in the reference data to be used for cell type annotation\
\ if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_target"
description: "The name of the adata obs column in the reference data containing\
\ cell type annotations."
info: null
example:
- "cell_ontology_class"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_predictions"
description: "In which `.obs` slots to store the predicted information.\n"
info: null
default:
- "onclass_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_probability"
description: "In which `.obs` slots to store the probability of the predictions.\n"
info: null
default:
- "onclass_prob"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Model arguments"
description: "Model arguments"
arguments:
- type: "string"
name: "--model"
description: "\"Pretrained model path without a file extension. If not provided,\
\ the model will be trained \non the reference data and --reference should be\
\ provided. The path namespace should contain:\n - a .npz or .pkl file\n -\
\ a .data file\n - a .meta file\n - a .index file\ne.g. /path/to/model/pretrained_model_target1\
\ as saved by OnClass.\"\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_iter"
description: "Maximum number of iterations for training the model."
info: null
default:
- 30
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "OnClass is a python package for single-cell cell type annotation. It\
\ uses the Cell Ontology to capture the cell type similarity. \nThese similarities\
\ enable OnClass to annotate cell types that are never seen in the training data.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "annotation_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
- type: "file"
path: "openpipelinetestutils"
dest: "openpipelinetestutils"
info: null
status: "enabled"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.8"
target_registry: "images.viash-hub.com"
target_tag: "fix-integration-tests"
namespace_separator: "/"
setup:
- type: "python"
user: false
packages:
- "scikit-learn==0.24.0"
- "OnClass==1.2"
- "tensorflow==2.13.1"
- "obonet==1.1.0"
- "mudata"
upgrade: true
test_setup:
- type: "docker"
copy:
- "openpipelinetestutils /opt/openpipelinetestutils"
- type: "python"
user: false
packages:
- "/opt/openpipelinetestutils"
upgrade: true
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/onclass/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/onclass"
executable: "target/executable/annotate/onclass/onclass"
viash_version: "0.9.0"
git_commit: "da62b4ffe30b6ef36fcb7ef5944f29d45d1138ff"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-1939-gda62b4ff"
package_config:
name: "openpipeline"
version: "fix-integration-tests"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.0"
source: "src"
target: "target"
config_mods:
- ".test_resources += {path: '/src/base/openpipelinetestutils', dest: 'openpipelinetestutils'}\n\
.resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].directives.tag := '$id'\n.runners[.type == 'nextflow'].config.script\
\ := 'includeConfig(\"nextflow_labels.config\")'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'fix-integration-tests'"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,42 @@
process {
// Process-scope defaults plus the resource labels that components can opt into.
// Default resources for components that hardly do any processing.
// The default memory request is multiplied by the attempt number.
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140); every other failure uses 'terminate'.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional memory ceiling read by get_memory(). While it is null (or otherwise falsy),
// get_memory() returns the requested amount unchanged.
maxMemory = null
// Resource labels
// CPU labels set a fixed number of cpus.
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory labels request a base amount multiplied by the attempt number,
// passed through get_memory() so that process.maxMemory can be applied.
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
}
/**
 * Applies the optional `process.maxMemory` ceiling to a requested memory amount.
 *
 * @param to_compare the requested amount (the labels above pass e.g. `4.GB * task.attempt`).
 * @return `to_compare` when no ceiling is configured or the request does not exceed it;
 *         `process.maxMemory` when the current attempt number equals `process.maxRetries`
 *         or when the request exceeds the ceiling.
 *
 * If any of the comparisons or conversions throws, an explanatory message is printed
 * and the JVM exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured (missing, null or otherwise falsy): return the request unchanged.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; confirm it resolves
    // in the context this helper is called from.
    // When the attempt number equals maxRetries, ask for the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
      // The request exceeds the ceiling: clamp to it. This branch used to reference the
      // undefined name `max_memory`, which always failed and fell into the catch below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    The root logger's level is set to INFO and a stream handler bound to
    ``sys.stdout`` is attached, formatted as
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The handler is tagged with a fixed name and attached only once, so calling
    this function repeatedly does not make every record appear multiple times.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    handler_name = "setup_logger_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Skip attaching when an earlier call already installed the console handler.
    already_attached = any(
        existing.get_name() == handler_name for existing in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.set_name(handler_name)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger

View File

@@ -0,0 +1,387 @@
name: "popv"
namespace: "annotate"
version: "fix-integration-tests"
authors:
- name: "Matthias Beyens"
roles:
- "author"
info:
role: "Contributor"
links:
github: "MatthiasBeyens"
orcid: "0000-0003-3304-0706"
email: "matthias.beyens@gmail.com"
linkedin: "mbeyens"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
- name: "Robrecht Cannoodt"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Inputs"
description: "Arguments related to the input (aka query) dataset."
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "Which layer to use. If no value is provided, the counts are assumed\
\ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_obs_batch"
description: "Key in obs field of input adata for batch information. If no value\
\ is provided, batch label is assumed to be unknown."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_var_subset"
description: "Subset the input object with this column."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_obs_label"
description: "Key in obs field of input adata for label information. This is only\
\ used for training scANVI. Unlabelled cells should be set to `\"unknown_celltype_label\"\
`."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--unknown_celltype_label"
description: "If `input_obs_label` is specified, cells with this value will be\
\ treated as unknown and will be predicted by the model."
info: null
default:
- "unknown"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "User-provided reference tissue. The data that will be used as reference\
\ to call cell types."
info: null
example:
- "TS_Bladder_filtered.h5ad"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "Which layer to use. If no value is provided, the counts are assumed\
\ to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_label"
description: "Key in obs field of reference AnnData with cell-type information."
info: null
default:
- "cell_ontology_class"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_batch"
description: "Key in obs field of reference adata for batch information."
info: null
default:
- "donor_assay"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Arguments"
description: "Other arguments."
arguments:
- type: "string"
name: "--methods"
description: "Methods to call cell types. By default, runs knn_on_scvi and\
\ scanvi."
info: null
example:
- "knn_on_scvi"
- "scanvi"
required: true
choices:
- "celltypist"
- "knn_on_bbknn"
- "knn_on_scanorama"
- "knn_on_scvi"
- "onclass"
- "rf"
- "scanvi"
- "svm"
direction: "input"
multiple: true
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Performs popular major vote cell typing on single cell sequence data\
\ using multiple algorithms. Note that this is a one-shot version of PopV."
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "annotation_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
- type: "file"
path: "openpipelinetestutils"
dest: "openpipelinetestutils"
info: null
status: "enabled"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highmem"
- "highcpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.9-slim"
target_registry: "images.viash-hub.com"
target_tag: "fix-integration-tests"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
- "git"
- "build-essential"
- "wget"
interactive: false
- type: "python"
user: false
packages:
- "scanpy~=1.9.6"
- "scvi-tools~=1.0.3"
- "popv~=0.3.2"
- "jax==0.4.10"
- "jaxlib==0.4.10"
- "ml-dtypes<0.3.0"
- "scipy==1.12.0"
upgrade: true
- type: "python"
user: false
packages:
- "anndata==0.10.8"
- "mudata~=0.2.4"
- "pandas!=2.1.2"
- "numpy<2.0.0"
upgrade: true
- type: "docker"
run:
- "cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git && \\\n\
\ cd PopV && git fetch --depth 1 origin tag v0.2 && git checkout v0.2\n"
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/annotate/popv/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/annotate/popv"
executable: "target/executable/annotate/popv/popv"
viash_version: "0.9.0"
git_commit: "da62b4ffe30b6ef36fcb7ef5944f29d45d1138ff"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-1939-gda62b4ff"
package_config:
name: "openpipeline"
version: "fix-integration-tests"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.0"
source: "src"
target: "target"
config_mods:
- ".test_resources += {path: '/src/base/openpipelinetestutils', dest: 'openpipelinetestutils'}\n\
.resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].directives.tag := '$id'\n.runners[.type == 'nextflow'].config.script\
\ := 'includeConfig(\"nextflow_labels.config\")'\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'fix-integration-tests'"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,42 @@
process {
// Process-scope defaults plus the resource labels that components can opt into.
// Default resources for components that hardly do any processing.
// The default memory request is multiplied by the attempt number.
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140); every other failure uses 'terminate'.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional memory ceiling read by get_memory(). While it is null (or otherwise falsy),
// get_memory() returns the requested amount unchanged.
maxMemory = null
// Resource labels
// CPU labels set a fixed number of cpus.
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory labels request a base amount multiplied by the attempt number,
// passed through get_memory() so that process.maxMemory can be applied.
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
}
/**
 * Applies the optional `process.maxMemory` ceiling to a requested memory amount.
 *
 * @param to_compare the requested amount (the labels above pass e.g. `4.GB * task.attempt`).
 * @return `to_compare` when no ceiling is configured or the request does not exceed it;
 *         `process.maxMemory` when the current attempt number equals `process.maxRetries`
 *         or when the request exceeds the ceiling.
 *
 * If any of the comparisons or conversions throws, an explanatory message is printed
 * and the JVM exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured (missing, null or otherwise falsy): return the request unchanged.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; confirm it resolves
    // in the context this helper is called from.
    // When the attempt number equals maxRetries, ask for the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
      // The request exceeds the ceiling: clamp to it. This branch used to reference the
      // undefined name `max_memory`, which always failed and fell into the catch below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

Some files were not shown because too many files have changed in this diff Show More