Build branch openpipeline_qc/v0.1.0 with version v0.1.0 to openpipeline_qc on branch v0.1 (4de00a2)

Build pipeline: test-vsh-ci-build-template-8gzht

Source commit: 4de00a2614

Source message: release v0.1
This commit is contained in:
CI
2025-09-23 06:56:43 +00:00
commit f45b3c0cea
116 changed files with 57241 additions and 0 deletions

27
.gitignore vendored Normal file
View File

@@ -0,0 +1,27 @@
# IDEs and editors
/.idea
.project
.classpath
*.launch
.settings/
.vscode
# Temp
gitignore
test_results
# System Files
.DS_Store
Thumbs.db
# Nextflow
work
.nextflow*
trace*.txt
# viash
/resources_test/
# pycache
*__pycache__*

3
CHANGELOG.MD Normal file
View File

@@ -0,0 +1,3 @@
# openpipeline_qc 0.1.0
Initial release containing a QC Reporting workflow for Xenium or CellRanger Multi ingested data, with corresponding components.

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 openpipelines-bio
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

28
_viash.yaml Normal file
View File

@@ -0,0 +1,28 @@
viash_version: 0.9.4
version: v0.1.0
source: src
target: target
name: openpipeline_qc
organization: vsh
links:
repository: https://github.com/openpipelines-bio/openpipeline_qc
docker_registry: ghcr.io
repositories:
- name: openpipeline
repo: openpipelines-bio/openpipeline
type: github
tag: 2.1.2
- name: craftbox
repo: craftbox
type: vsh
tag: v0.2.0
info:
test_resources:
- type: s3
path: s3://openpipelines-bio/openpipeline_incubator/resources_test
dest: resources_test
config_mods: |
.requirements.commands := ['ps']
.runners[.type == 'nextflow'].directives.tag := '$id'
.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}
.runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'

0
main.nf Normal file
View File

0
nextflow.config Normal file
View File

View File

@@ -0,0 +1,166 @@
#!/bin/bash

# Local directories that receive the generated test resources.
OUT_DIR=resources_test/qc_sample_data
OUT_DIR_SPATIAL=resources_test/spatial_qc_sample_data

# `mkdir -p` is a no-op for directories that already exist, so no guard is needed.
mkdir -p "$OUT_DIR" "$OUT_DIR_SPATIAL"
# fetch/create h5mu from somewhere
# Write the parameter file for the add_id run below. The heredoc delimiter is
# unquoted, so "$OUT_DIR" is expanded by the shell while \$id is written out
# as a literal $id. Both samples are created from the same source file.
cat > /tmp/params_create_h5mu.yaml <<EOF
param_list:
- id: sample_one
input_id: sample_one
input: s3://openpipelines-data/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_qc.h5mu
- id: sample_two
input_id: sample_two
input: s3://openpipelines-data/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_qc.h5mu
output: '\$id.qc.h5mu'
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF
# add the sample ID to the mudata object
nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.1.2 \
-main-script target/nextflow/metadata/add_id/main.nf \
-c src/configs/labels_ci.config \
-profile docker \
-params-file /tmp/params_create_h5mu.yaml \
-resume
# Parameter file for the subsetting run below.
# NOTE(review): the inputs and the published outputs resolve to the same
# paths under resources_test/qc_sample_data, so this step presumably replaces
# the add_id results in place - confirm this is intended.
cat > /tmp/params_subset.yaml <<EOF
param_list:
- id: sample_one
input: resources_test/qc_sample_data/sample_one.qc.h5mu
- id: sample_two
input: resources_test/qc_sample_data/sample_two.qc.h5mu
output: '\$id.qc.h5mu'
number_of_observations: 10000
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF
# subset h5mus
nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.1.2 \
-main-script target/nextflow/filter/subset_h5mu/main.nf \
-c src/configs/labels_ci.config \
-profile docker \
-params-file /tmp/params_subset.yaml \
-resume
# Write a helper script that adds mock .obs metadata to the generated h5mu files.
# The heredoc delimiter is intentionally unquoted: $(pwd) is expanded by the
# shell so the Python script receives an absolute directory.
cat > /tmp/add_metadata_obs.py <<EOF
import mudata as mu
import glob
import numpy as np
import pandas as pd
import os
import zlib

# Directory containing the h5mu files
out_dir = "$(pwd)/resources_test/qc_sample_data"

# List of h5mu files (sorted so the processing order is stable)
h5mu_files = sorted(glob.glob(os.path.join(out_dir, "*.h5mu")))
print(f"Found {len(h5mu_files)} h5mu files: {h5mu_files}")

# Metadata values to randomly assign
donor_ids = ["donor_1", "donor_2", "donor_3"]
cell_types = ["CD4+ T cell", "CD8+ T cell", "B cell", "NK cell", "Monocyte"]
batches = ["batch_A", "batch_B"]
conditions = ["treated", "control"]

for h5mu_file in h5mu_files:
    print(f"Processing {h5mu_file}...")

    # Load MuData object
    mdata = mu.read_h5mu(h5mu_file)
    rna = mdata.mod["rna"]
    n_obs = rna.n_obs

    # Different seed for each file but reproducible. The built-in hash() is
    # salted per interpreter run for strings, and the absolute path differs
    # between machines, so derive the seed from a CRC of the file name instead.
    file_name = os.path.basename(h5mu_file)
    np.random.seed(42 + zlib.crc32(file_name.encode("utf-8")) % 100)

    # Create metadata
    rna.obs["donor_id"] = np.random.choice(donor_ids, size=n_obs)
    rna.obs["cell_type"] = np.random.choice(cell_types, size=n_obs)
    rna.obs["batch"] = np.random.choice(batches, size=n_obs)
    rna.obs["condition"] = np.random.choice(conditions, size=n_obs)

    # Add a continuous variable too
    rna.obs["quality_score"] = np.random.uniform(0, 1, size=n_obs)

    # Save the modified MuData object
    mu.write_h5mu(h5mu_file, mdata)
    print(f"Added metadata to {h5mu_file}")

print("All files processed successfully!")
EOF

# Execute the Python script
python /tmp/add_metadata_obs.py
# generate cellbender out for testing
# Parameter file for the cellbender run below; results are published next to
# the inputs as <id>.qc.cellbender.h5mu.
cat > /tmp/params_cellbender.yaml <<EOF
param_list:
- id: sample_one
input: resources_test/qc_sample_data/sample_one.qc.h5mu
- id: sample_two
input: resources_test/qc_sample_data/sample_two.qc.h5mu
output: '\$id.qc.cellbender.h5mu'
epochs: 5
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF
nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.1.2 \
-main-script target/nextflow/correction/cellbender_remove_background/main.nf \
-c src/configs/labels_ci.config \
-profile docker \
-params-file /tmp/params_cellbender.yaml \
-resume
# fetch spatial sample data from s3
aws s3 sync \
--profile di \
s3://openpipelines-bio/openpipeline_incubator/resources_test/spatial_qc_sample_data \
"$OUT_DIR_SPATIAL"
# generate json for testing
# Single-cell report inputs: the --obs_metadata keys are the columns written
# by /tmp/add_metadata_obs.py earlier in this script.
viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
--input "$OUT_DIR"/sample_one.qc.cellbender.h5mu \
--input "$OUT_DIR"/sample_two.qc.cellbender.h5mu \
--ingestion_method cellranger_multi \
--obs_metadata "donor_id;cell_type;batch;condition" \
--output "$OUT_DIR"/sc_dataset.json \
--output_reporting_json "$OUT_DIR"/sc_report_structure.json
# Xenium report inputs.
# NOTE(review): the same xenium_tiny.qc.h5mu file is passed twice - presumably
# to exercise the multi-input code path; confirm this is deliberate.
viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
--input "$OUT_DIR_SPATIAL"/xenium_tiny.qc.h5mu \
--input "$OUT_DIR_SPATIAL"/xenium_tiny.qc.h5mu \
--ingestion_method xenium \
--min_num_nonzero_vars 1 \
--output "$OUT_DIR_SPATIAL"/xenium_dataset.json \
--output_reporting_json "$OUT_DIR_SPATIAL"/xenium_report_structure.json
# remove all state yaml files
# -f keeps this step quiet (and successful) when a directory holds no yaml files.
rm -f "$OUT_DIR"/*.yaml
rm -f "$OUT_DIR_SPATIAL"/*.yaml

# copy to s3
# NOTE: --dryrun only lists the operations that would be performed; remove it
# to actually upload. --delete removes remote files that are absent locally.
aws s3 sync \
  "$OUT_DIR" \
  s3://openpipelines-bio/openpipeline_incubator/"$OUT_DIR" \
  --delete \
  --dryrun

aws s3 sync \
  "$OUT_DIR_SPATIAL" \
  s3://openpipelines-bio/openpipeline_incubator/"$OUT_DIR_SPATIAL" \
  --delete \
  --dryrun

View File

@@ -0,0 +1,37 @@
#!/bin/bash

# Local directory that receives the generated spatial QC test resources.
OUT_DIR=resources_test/spatial_qc_sample_data

# `mkdir -p` is a no-op when the directory already exists.
mkdir -p "$OUT_DIR"

# fetch/create h5mu from somewhere
# The heredoc delimiter is unquoted: "$OUT_DIR" is expanded by the shell while
# \$id is written to the file as a literal $id.
cat > /tmp/qc.yaml <<EOF
param_list:
  - id: xenium_tiny
    input: s3://openpipelines-bio/openpipeline_spatial/resources_test/xenium/xenium_tiny.h5mu
  - id: Lung5_Rep2_tiny
    input: s3://openpipelines-bio/openpipeline_spatial/resources_test/cosmx/Lung5_Rep2_tiny.h5mu
var_name_mitochondrial_genes: mitochondrial
var_name_ribosomal_genes: ribosomal
output: '\$id.qc.h5mu'
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF

# Run the QC workflow on both spatial samples.
# NOTE(review): this pins openpipeline 2.1.0 while _viash.yaml and the
# single-cell resource script use 2.1.2 - confirm the older tag is intended.
nextflow run openpipelines-bio/openpipeline \
  -latest \
  -r 2.1.0 \
  -main-script target/nextflow/workflows/qc/qc/main.nf \
  -profile docker \
  -params-file /tmp/qc.yaml \
  -resume \
  -config src/configs/labels_ci.config

# copy to s3
# NOTE: --dryrun only lists the operations that would be performed; remove it
# to actually upload. Only *.h5mu files are considered for the sync.
aws s3 sync \
  --profile di \
  resources_test/spatial_qc_sample_data \
  s3://openpipelines-bio/openpipeline_incubator/resources_test/spatial_qc_sample_data \
  --delete --dryrun \
  --exclude "*" --include "*.h5mu"

View File

@@ -0,0 +1,11 @@
name: Dorien Roosen
info:
role: Core Team Member
links:
email: dorien@data-intuitive.com
github: dorien-er
linkedin: dorien-roosen
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist

View File

@@ -0,0 +1,11 @@
name: Jakub Majercik
info:
role: Contributor
links:
email: jakub@data-intuitive.com
github: jakubmajercik
linkedin: jakubmajercik
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Bioinformatics Engineer

View File

@@ -0,0 +1,15 @@
name: Robrecht Cannoodt
info:
role: Core Team Member
links:
email: robrecht@data-intuitive.com
github: rcannood
orcid: "0000-0003-3641-729X"
linkedin: robrechtcannoodt
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Science Engineer
- name: Open Problems
href: https://openproblems.bio
role: Core Member

View File

@@ -0,0 +1,6 @@
name: Weiwei Schultz
info:
role: Contributor
organizations:
- name: Janssen R&D US
role: Associate Director Data Sciences

View File

@@ -0,0 +1,2 @@
packages:
- anndata~=0.11.1

View File

@@ -0,0 +1,9 @@
__merge__: [/src/base/requirements/anndata.yaml, .]
packages:
- mudata~=0.3.1
# Make sure that awkward is not installed. Currently, support of awkward arrays
# in anndata is experimental, and it is enabled based on whether or not the package
# is available. By making sure that awkward is not installed, the functionality is
# not enabled.
script: |
exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")

View File

@@ -0,0 +1,2 @@
packages:
- viashpy==0.8.0

View File

@@ -0,0 +1,36 @@
// Nextflow profiles: temp-dir mounting, disabling publishing, and running with docker.
profiles {
// detect tempdir
// The first environment variable that is set wins; '/tmp' is the fallback.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Point the container engines' temp setting at the detected directory.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Disable publishDir for every process (the '.*' selector matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Enable docker and explicitly switch off the other container engines.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
}

66
src/configs/labels.config Normal file
View File

@@ -0,0 +1,66 @@
// Default process resources and the label -> resource mapping used by the components.
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// While maxMemory is unset (null), get_memory() returns the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// The request grows with every attempt; get_memory() applies the maxMemory cap.
withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
/**
 * Cap a requested memory amount at `process.maxMemory`.
 *
 * - When `process.maxMemory` is missing or unset, the request is returned as is.
 * - On the attempt whose number equals `process.maxRetries`, `process.maxMemory`
 *   is returned so the final try gets everything that is allowed.
 * - Otherwise the smaller of the request and `process.maxMemory` is returned.
 *
 * @param to_compare the requested memory (e.g. `8.GB * task.attempt`)
 * @return the memory value to assign to the task
 *
 * If the configured values cannot be processed, an error is printed and the
 * JVM exits with status 1.
 */
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request exceeds the cap. The original returned an undefined
      // `max_memory` variable here, which always ended up in the catch below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,105 @@
// CI resource settings: every memory label is pinned to 13.Gb and every CPU label to 4.
process {
withLabel: lowmem { memory = 13.Gb }
withLabel: lowcpu { cpus = 4 }
withLabel: midmem { memory = 13.Gb }
withLabel: midcpu { cpus = 4 }
withLabel: highmem { memory = 13.Gb }
withLabel: highcpu { cpus = 4 }
withLabel: veryhighmem { memory = 13.Gb }
// Disk labels keep whatever process.disk is set to (null when it is unset).
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
}
// Set NUMBA_CACHE_DIR to /tmp through the env scope.
env.NUMBA_CACHE_DIR = '/tmp'
// Write a trace report and allow trace/DAG files to be overwritten between runs.
trace {
enabled = true
overwrite = true
}
dag {
overwrite = true
}
// Limit every process to one parallel instance.
process.maxForks = 1
profiles {
// detect tempdir
// The first environment variable that is set wins; '/tmp' is the fallback.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Point the container engines' temp setting at the detected directory.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Disable publishDir for every process (the '.*' selector matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Enable docker (with fixOwnership) and switch off the other container engines.
docker {
docker.fixOwnership = true
docker.enabled = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
local {
// This config is for local processing.
process {
// Upper bound that get_memory() applies to the label requests below.
maxMemory = 25.GB
withLabel: verylowcpu { cpus = 2 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 6 }
withLabel: highcpu { cpus = 12 }
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
}
}
}
/**
 * Cap a requested memory amount at `process.maxMemory`.
 *
 * - When `process.maxMemory` is missing or unset, the request is returned as is.
 * - On the attempt whose number equals `process.maxRetries`, `process.maxMemory`
 *   is returned so the final try gets everything that is allowed.
 * - Otherwise the smaller of the request and `process.maxMemory` is returned.
 *
 * @param to_compare the requested memory (e.g. `8.GB * task.attempt`)
 * @return the memory value to assign to the task
 *
 * If the configured values cannot be processed, an error is printed and the
 * JVM exits with status 1.
 */
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request exceeds the cap. The original returned an undefined
      // `max_memory` variable here, which always ended up in the catch below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,67 @@
name: detect_ingestion_method
namespace: ingestion_qc
description: |
Detects the ingestion method of a dataset.
Currently detects either 10X CellRanger Multi, 10X Xenium or Nanostring CosMx, but can be extended to other technologies upon request.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [author]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [contributor]
argument_groups:
- name: Inputs
arguments:
- name: --input
type: file
required: true
direction: input
description: The input h5mu file(s)
example: path/to/file.h5mu
- name: --modality
type: string
description: The modality to use
default: rna
- name: Outputs
arguments:
- name: --output_uns_ingestion_method
description: The .uns field in which to store the experimental setup. Values stored are `cellranger_multi`, `xenium` or `cosmx`.
type: string
default: ingestion_method
- name: --output
type: file
required: true
direction: output
description: The output h5mu file, containing an .uns field with experiment description.
example: path/to/file.h5mu
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/qc_sample_data/sample_one.qc.h5mu
- path: /resources_test/spatial_qc_sample_data/Lung5_Rep2_tiny.qc.h5mu
- path: /resources_test/spatial_qc_sample_data/xenium_tiny.qc.h5mu
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml]
test_setup:
- type: apt
packages:
- git
- type: python
__merge__: [/src/base/requirements/viashpy.yaml]
github: openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, lowdisk]

View File

@@ -0,0 +1,69 @@
import shutil
import anndata as ad
import h5py
import sys
## VIASH START
par = {
"input": "resources_test/qc_sample_data/sample_one.qc.h5mu",
# "input": "resources_test/spatial_qc_sample_data/xenium/xenium_tiny_qc.h5mu",
# "input": "/Users/dorienroosen/code/openpipeline_spatial/resources_test/cosmx/Lung5_Rep2_tiny.h5mu",
"output": "output.h5mu",
"output_uns_ingestion_method": "ingestion_method",
"modality": "rna"
}
meta = {
"resources_dir": "src/utils"
}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
logger = setup_logger()
def main(par):
    """Detect how the input h5mu was ingested and record it in the output's global `.uns`.

    Reads the global `.uns` and the `.obs`/`.uns` of the selected modality,
    derives the ingestion method from marker fields, copies the input file to
    the output path and stores the detected method under
    `par["output_uns_ingestion_method"]` in the global `.uns` of the copy.

    Args:
        par: component parameters; reads the keys `input`, `output`,
            `modality` and `output_uns_ingestion_method`.

    Raises:
        ValueError: if no method or more than one method is detected, or if
            the target field already holds a different value.
    """
    uns_key = par["output_uns_ingestion_method"]

    # read h5mu file
    with h5py.File(par["input"], "r") as file:
        mod = file["mod"][par["modality"]]
        uns = ad.experimental.read_elem(file["uns"])
        mod_obs = ad.experimental.read_elem(mod["obs"])
        mod_uns = ad.experimental.read_elem(mod["uns"])

    # detect ingestion method
    ingestion_methods = {
        "cellranger_multi": "metrics_cellranger" in uns,
        "xenium": all(key in mod_obs for key in ["segmentation_method", "nucleus_area"]),
        "cosmx": "spatial" in mod_uns and "fov" in mod_obs
    }

    # make sure only one ingestion method is detected
    detected_methods = [method for method, detected in ingestion_methods.items() if detected]
    methods_count = len(detected_methods)

    if methods_count == 1:
        detected_method = detected_methods[0]
        logger.info(f"Detected ingestion method {detected_method}")
    elif methods_count == 0:
        raise ValueError("No ingestion method detected")
    else:
        raise ValueError(f"Multiple ingestion methods detected: {', '.join(detected_methods)}")

    # Refuse to continue when the field already holds a different value. The
    # global .uns is the one that gets written below; the modality .uns is
    # still checked so conflicts that were reported before keep being reported.
    for existing_uns in (uns, mod_uns):
        if existing_uns.get(uns_key, detected_method) != detected_method:
            raise ValueError(f"Field .uns['{uns_key}'] already exists and contains different value `{existing_uns.get(uns_key)}` than detected method (`{detected_method}`).")

    # copy input to output
    shutil.copy(par["input"], par["output"])

    # Only write when the global .uns lacks the field: assigning to an existing
    # h5py name fails, and an existing value is known to match at this point.
    if uns_key not in uns:
        with h5py.File(par["output"], "r+") as out_file:
            out_file["uns"][uns_key] = detected_method
if __name__ == "__main__":
main(par)

View File

@@ -0,0 +1,63 @@
import pytest
import h5py
import os
import anndata as ad
import sys
## VIASH START
meta = {
"resources_dir": "resources_test"
}
## VIASH END
def test_cellranger(run_component, tmp_path):
    """Running the component on sample_one.qc.h5mu must record `cellranger_multi` in .uns."""
    output = tmp_path / "output_cellranger.h5mu"
    input_path = meta["resources_dir"] + "/sample_one.qc.h5mu"
    cli_args = ["--input", input_path, "--output", output]

    run_component(cli_args)

    assert os.path.exists(output), "Output file was not created"

    # Read the global .uns of the produced file.
    with h5py.File(output, "r") as handle:
        global_uns = ad.experimental.read_elem(handle["uns"])
    assert global_uns["ingestion_method"] == "cellranger_multi", "cellranger_multi not detected"
def test_xenium(run_component, tmp_path):
    """Running the component on xenium_tiny.qc.h5mu must record `xenium` in .uns."""
    output = tmp_path / "output_xenium.h5mu"
    input_path = meta["resources_dir"] + "/xenium_tiny.qc.h5mu"
    cli_args = ["--input", input_path, "--output", output]

    run_component(cli_args)

    assert os.path.exists(output), "Output file was not created"

    # Read the global .uns of the produced file.
    with h5py.File(output, "r") as handle:
        global_uns = ad.experimental.read_elem(handle["uns"])
    assert global_uns["ingestion_method"] == "xenium", "xenium not detected"
def test_cosmx(run_component, tmp_path):
    """Running the component on Lung5_Rep2_tiny.qc.h5mu must record `cosmx` in .uns."""
    output = tmp_path / "output_cosmx.h5mu"
    input_path = meta["resources_dir"] + "/Lung5_Rep2_tiny.qc.h5mu"
    cli_args = ["--input", input_path, "--output", output]

    run_component(cli_args)

    assert os.path.exists(output), "Output file was not created"

    # Read the global .uns of the produced file.
    with h5py.File(output, "r") as handle:
        global_uns = ad.experimental.read_elem(handle["uns"])
    assert global_uns["ingestion_method"] == "cosmx", "cosmx not detected"
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,64 @@
name: generate_html
namespace: ingestion_qc
description: Generate an HTML report from the QC metrics
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [author]
- __merge__: /src/authors/dorien_roosen.yaml
roles: [author]
- __merge__: /src/authors/robrecht_cannoodt.yaml
roles: [author]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [contributor]
argument_groups:
- name: Inputs
arguments:
- name: --input_data
type: file
required: true
direction: input
description: The input JSON file containing the QC metrics
example: path/to/file.json
- name: --input_structure
type: file
required: true
direction: input
description: The input JSON file containing the structure of the data
example: path/to/file.json
- name: Outputs
arguments:
- name: --output_qc_report
type: file
required: true
direction: output
description: The output HTML report
example: path/to/file.html
resources:
- type: bash_script
path: script.sh
test_resources:
- type: bash_script
path: test.sh
- path: /resources_test/qc_sample_data/sc_dataset.json
- path: /resources_test/qc_sample_data/sc_report_structure.json
- path: /resources_test/spatial_qc_sample_data/xenium_dataset.json
- path: /resources_test/spatial_qc_sample_data/xenium_report_structure.json
engines:
- type: docker
image: node:latest
setup:
- type: apt
packages:
- git
- type: docker
run: |
npm install -g pnpm@latest-10 \
&& cd /opt && git clone -b v0.1.0 https://github.com/openpipelines-bio/siqc.git \
&& cd siqc && pnpm install \
&& true
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, lowdisk]

View File

@@ -0,0 +1,18 @@
# Stop at the first failing command so a broken build is not masked by later steps.
set -eo pipefail

# Resolve all paths before changing directory; relative paths would otherwise
# be interpreted relative to /opt/siqc. Quoted so paths with spaces survive.
ABSOLUTE_INPUT_DATA=$(realpath "$par_input_data")
ABSOLUTE_INPUT_STRUCTURE=$(realpath "$par_input_structure")
ABSOLUTE_OUTPUT=$(realpath "$par_output_qc_report")

cd /opt/siqc

# -p: do not fail when the directory already exists (e.g. on a second run).
mkdir -p src/data

echo "Compressing input data..."
pnpm run compress_data "$ABSOLUTE_INPUT_DATA" "src/data/dataset.ts"

echo "Compressing report structure..."
pnpm run compress_data "$ABSOLUTE_INPUT_STRUCTURE" "src/data/report_structure.ts"

echo "Generating HTML..."
pnpm run build

echo "Copying HTML to output directory..."
cp dist/index.html "$ABSOLUTE_OUTPUT"

View File

@@ -0,0 +1,10 @@
# Abort the test as soon as the component itself fails.
set -e

echo ">> Generating report"
# No trailing backslash after the last argument: it previously turned the
# following echo command into extra arguments of the executable.
"$meta_executable" \
  --input_data "$meta_resources_dir/sc_dataset.json" \
  --input_structure "$meta_resources_dir/sc_report_structure.json" \
  --output_qc_report "index.html"

echo ">> Checking output"
[ ! -f "index.html" ] && echo "Error: Output report does not exist." && exit 1

echo ">> Test succesful" && exit 0

View File

@@ -0,0 +1,168 @@
name: h5mu_to_qc_json
namespace: ingestion_qc
scope: private
description: |
Takes H5MU files that have been ingested by CellRanger, Xenium or CosMx and processed by the QC workflow, and generates:
- A JSON file that contains the combined data for the QC report
- A JSON file that defines the layout and structure of the QC report
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [author]
- __merge__: /src/authors/dorien_roosen.yaml
roles: [author]
- __merge__: /src/authors/robrecht_cannoodt.yaml
roles: [author]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [contributor]
argument_groups:
- name: Inputs
arguments:
- name: --input
type: file
multiple: true
required: true
direction: input
description: The input h5mu file(s)
example: path/to/file.h5mu
- name: --modality
type: string
description: The modality to use
default: rna
- name: --ingestion_method
type: string
required: true
choices:
- cellranger_multi
- xenium
description: Method that was used to ingest the data - this will define the structure of the report that is generated.
- name: --obs_sample_id
type: string
description: The key in the h5mu file that contains the sample ID. If not provided, each H5MU file will be considered as a separate sample.
default: sample_id
- name: --obs_total_counts
type: string
description: The key in the h5mu .obs field that contains the total counts.
default: total_counts
- name: --obs_num_nonzero_vars
type: string
description: The key in the h5mu .obs field that contains the number of nonzero vars.
default: num_nonzero_vars
- name: --obs_fraction_mitochondrial
type: string
description: The key in the h5mu .obs field that contains the fraction mitochondrial genes.
default: fraction_mitochondrial
- name: --obs_fraction_ribosomal
type: string
description: The key in the h5mu .obs field that contains the fraction ribosomal genes.
default: fraction_ribosomal
- name: Outputs
arguments:
- name: --output
type: file
required: true
direction: output
description: The output JSON file
example: path/to/file.json
- name: --output_reporting_json
type: file
required: true
description: The output JSON file that defines the QC report
direction: output
example: path/to/file.json
- name: Filtering & grouping options
arguments:
- name: --min_total_counts
type: integer
description: Minimum total counts for a cell to be included in the output
default: 10
- name: --min_num_nonzero_vars
type: integer
description: Minimum number of nonzero vars for a cell to be included in the output
default: 10
- name: --obs_metadata
type: string
multiple: true
description: The metadata keys in the h5mu .obs to include in the output JSON.
example: "donor_id;cell_type;batch;condition"
- name: Options for CellRanger reports
arguments:
- name: --obs_cellbender
type: string
multiple: true
description: The cellbender keys in the h5mu .obs to include in the output JSON
default: [
"cellbender_background_fraction",
"cellbender_cell_probability",
"cellbender_cell_size",
"cellbender_droplet_efficiency"
]
- name: --uns_cellranger_metrics
type: string
description: The key in the h5mu file .uns that contains the cellranger metrics
default: metrics_cellranger
- name: Options for Xenium reports
arguments:
- name: --obs_nucleus_area
type: string
description: The key in the h5mu .obs field that contains the nucleus area.
default: nucleus_area
- name: --obs_cell_area
type: string
description: The key in the h5mu .obs field that contains the cell area.
default: cell_area
- name: --obs_x_coord
type: string
description: The key in the h5mu .obs field that contains the x coordinate.
default: x_coord
- name: --obs_y_coord
type: string
description: The key in the h5mu .obs field that contains the y coordinate.
default: y_coord
- name: --obs_control_probe_counts
type: string
description: The key in the h5mu .obs field that contains the number of control probes.
default: control_probe_counts
- name: --obs_control_codeword_counts
type: string
description: The key in the h5mu .obs field that contains the number of control codewords.
default: control_codeword_counts
# - name: Options for CosMx reports
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: report_structure
test_resources:
- type: python_script
path: test.py
- type: file
path: /resources_test
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml ]
test_setup:
- type: apt
packages:
- git
- type: python
__merge__: [/src/base/requirements/viashpy.yaml]
github: openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils
runners:
- type: executable
- type: nextflow
directives:
label: [midmem, middisk]

View File

@@ -0,0 +1,162 @@
{
"categories": [
{
"name": "Sample QC",
"key": "sample_summary_stats",
"additionalAxes": false,
"defaultFilters": []
},
{
"name": "SampleQC",
"key": "metrics_cellranger_stats",
"additionalAxes": false,
"defaultFilters": [
{
"type": "bar",
"field": "Number_of_reads_in_the_library",
"label": "Number of reads per library",
"description": "Sequencing depth per sample. Higher values generally indicate more comprehensive cell profiling.",
"nBins": 10,
"groupBy": "sample_id",
"xAxisType": "linear",
"yAxisType": "linear"
},
{
"type": "bar",
"field": "Confidently_mapped_reads_in_cells",
"label": "Confidently mapped reads in cells",
"description": "Number of reads that were mapped unambiguously to the reference genome within cell-containing droplets.",
"groupBy": "sample_id",
"nBins": 10,
"yAxisType": "linear"
},
{
"type": "bar",
"field": "Estimated_number_of_cells",
"label": "Estimated number of cells",
"description": "CellRanger's estimate of the number of cells per sample based on the UMI count distribution.",
"groupBy": "sample_id",
"nBins": 10,
"yAxisType": "linear"
},
{
"type": "bar",
"field": "Sequencing_saturation",
"label": "Sequencing saturation",
"description": "Fraction of reads that are duplicates of existing UMIs. Higher values suggest deeper sequencing coverage.",
"groupBy": "sample_id",
"nBins": 10,
"yAxisType": "linear"
}
]
},
{
"name": "Cell RNA QC",
"key": "cell_rna_stats",
"additionalAxes": true,
"defaultFilters": [
{
"type": "histogram",
"field": "total_counts",
"label": "Total UMI per cell",
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "num_nonzero_vars",
"label": "Number of non-zero genes per cell",
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "fraction_mitochondrial",
"label": "Fraction UMI of mitochondrial genes per cell",
"description": "Proportion of cell's RNA from mitochondrial genes.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "fraction_ribosomal",
"label": "Fraction UMI of ribosomal genes per cell",
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "pct_of_counts_in_top_50_vars",
"label": "Fraction UMI in top 50 genes per cell",
"description": "Proportion of RNA molecules from the 50 most-expressed genes in each cell.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_cell_probability",
"label": "CellBender cell probability",
"description": "CellBender's statistical confidence (0-1) that a barcode represents a real cell, with higher values indicating stronger confidence.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_background_fraction",
"label": "CellBender background fraction",
"description": "Estimated percentage of each cell's RNA that comes from the ambient solution rather than the cell itself.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_cell_size",
"label": "CellBender cell size",
"description": "CellBender's estimate of the true number of RNA molecules in each cell after removing ambient contamination. Reflects actual cell RNA content rather than raw UMI counts.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_droplet_efficiency",
"label": "CellBender droplet efficiency",
"description": "CellBender's estimate of how efficiently each droplet captured RNA molecules. Higher values indicate more reliable RNA sampling within individual droplets.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
}
]
}
]
}

View File

@@ -0,0 +1,91 @@
{
"categories": [
{
"name": "Sample QC",
"key": "sample_summary_stats",
"additionalAxes": false,
"defaultFilters": []
},
{
"name": "Cell RNA QC",
"key": "cell_rna_stats",
"additionalAxes": true,
"defaultFilters": [
{
"type": "histogram",
"visualizationType": "histogram",
"field": "total_counts",
"label": "Total UMI per cell",
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "num_nonzero_vars",
"label": "Number of non-zero genes per cell",
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "fraction_mitochondrial",
"label": "Fraction UMI of mitochondrial genes per cell",
"description": "Proportion of cell's RNA from mitochondrial genes.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "fraction_ribosomal",
"label": "Fraction UMI of ribosomal genes per cell",
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "cell_area",
"label": "Segmented cell area",
"description": "Area of the segmented cells.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "nucleus_ratio",
"label": "Nucleus Ratio",
"description": "Ratio of the nucleus area to the segmented cell area.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
}
]
}
]
}

View File

@@ -0,0 +1,385 @@
import json
import pandas as pd
from pathlib import Path
import anndata as ad
import h5py
import sys
import os
import shutil
## VIASH START
# Development-time defaults for running this script directly.
# NOTE(review): presumably everything between the VIASH START/END markers is
# replaced by viash-generated values when the component is built - confirm.
# inputs = list(Path("data/sample_data/sample_data").glob("*.h5mu"))
# output = "data/sample-data.json"
inputs = list(Path("resources_test_after_running_script/qc_sample_data").glob("*.qc.h5mu"))
output = "tmp.json"
# `par` holds the component arguments read throughout this script.
par = {
"input": sorted([str(x) for x in inputs]),
# "input": ["resources_test/spatial_qc_sample_data/xenium_tiny.qc.h5mu", "resources_test/spatial_qc_sample_data/xenium_tiny.qc.h5mu"],
"output": "sc_data.json",
"output_reporting_json": "sc_report_structure.json",
"modality": "rna",
"ingestion_method": "cellranger_multi",
"obs_sample_id": "sample_id",
"obs_total_counts": "total_counts",
"obs_num_nonzero_vars": "num_nonzero_vars",
"obs_fraction_mitochondrial": "fraction_mitochondrial",
"obs_fraction_ribosomal": "fraction_ribosomal",
"min_total_counts": 20,
"min_num_nonzero_vars": 20,
"obs_cellbender": [
"cellbender_background_fraction",
"cellbender_cell_probability",
"cellbender_cell_size",
"cellbender_droplet_efficiency",
],
"uns_cellranger_metrics": "metrics_cellranger",
"obs_metadata": ["cell_type"],
"obs_nucleus_area": "nucleus_area",
"obs_cell_area": "cell_area",
"obs_x_coord": "x_coord",
"obs_y_coord": "y_coord",
"obs_control_probe_counts": "control_probe_counts",
"obs_control_codeword_counts": "control_codeword_counts"
}
# `meta["resources_dir"]` is the directory the report_structure/*.json files
# and the setup_logger module are loaded from further down.
meta = {
"resources_dir": os.path.abspath("src/ingestion_qc/h5mu_to_qc_json"),
}
# Convenience variables for stepping through main()'s loop body interactively.
i = 0
mudata_file = par["input"][i]
sys.path.append("src/utils")
## VIASH END
# Make the helper modules shipped next to this script importable.
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
logger = setup_logger()
# Normalise "no cellbender columns" to an empty container; the code below only
# truth-tests and iterates over this value.
par["obs_cellbender"] = {} if not par["obs_cellbender"] else par["obs_cellbender"]
def transform_df(df):
    """Transform a DataFrame into the annotation object format.

    Each column becomes a dict with its ``name``, a ``dtype`` label
    (``"integer"``, ``"numeric"`` or ``"categorical"``) and its ``data``.
    Categorical columns store their integer codes in ``data`` and the
    category labels in ``categories``.

    Args:
        df: DataFrame whose columns are all integer, float or categorical.

    Returns:
        dict with ``num_rows``, ``num_cols``, the two pre-filter thresholds
        taken from ``par`` and the list of per-column dicts under ``columns``.

    Raises:
        ValueError: if a column has any other dtype.
    """
    # Local import: only this function needs it.
    import math

    columns = []
    for name in df.columns:
        data = df[name]

        # Determine dtype
        if pd.api.types.is_integer_dtype(data):
            dtype = "integer"
        elif pd.api.types.is_float_dtype(data):
            dtype = "numeric"
        # is_categorical_dtype() is deprecated in recent pandas releases;
        # checking the dtype instance is the supported equivalent.
        elif isinstance(data.dtype, pd.CategoricalDtype):
            dtype = "categorical"
        else:
            raise ValueError(f"Unknown/unsupported data type for column {name}")

        column_info = {"name": name, "dtype": dtype}
        if dtype == "categorical":
            column_info["data"] = data.cat.codes.tolist()
            column_info["categories"] = data.cat.categories.tolist()
        elif dtype == "numeric":
            # json.dump() would write +/-inf as `Infinity`, which is not valid
            # JSON, so non-finite floats are emitted as null just like NaN.
            column_info["data"] = [
                None if pd.isna(x) or math.isinf(x) else x for x in data
            ]
        else:
            column_info["data"] = [None if pd.isna(x) else x for x in data]
        columns.append(column_info)

    return {
        "num_rows": len(df),
        "num_cols": len(df.columns),
        "min_total_counts": par["min_total_counts"],
        "min_num_nonzero_vars": par["min_num_nonzero_vars"],
        "columns": columns,
    }
def check_optional_obs_keys(obs, keys, message):
    """Log which of the optional `keys` are not columns of `obs`.

    Nothing is logged when every key is present. `message` is appended to
    the log line as a hint for the user.
    """
    absent = []
    for candidate in keys:
        if candidate not in obs.columns:
            absent.append(candidate)
    if not absent:
        return
    logger.info(f"Missing keys in obs: {', '.join(absent)}. {message}")
def transform_cellranger_metrics(uns, sample_id):
    """Reshape the Cell Ranger metrics table of one sample into a single wide row.

    The table stored under ``par["uns_cellranger_metrics"]`` is pivoted on its
    "Metric Name" / "Metric Value" columns, thousands separators are removed,
    spaces in the resulting column names become underscores and every column
    is coerced to a number (values that cannot be parsed become NaN).
    The first entry of `sample_id` is added as the ``sample_id`` column.

    Raises:
        ValueError: if the metrics key is not present in `uns`.
    """
    metrics_key = par["uns_cellranger_metrics"]
    if metrics_key not in uns:
        raise ValueError(f"Could not find cellranger metrics in uns: {par['uns_cellranger_metrics']}. Provide correct value for --uns_cellranger_metrics or make sure data was ingested using CellRanger multi.")

    def _strip_thousands(value):
        # Only strings can carry a thousands separator; leave the rest as is.
        if isinstance(value, str):
            return pd.to_numeric(value.replace(",", ""), errors="coerce")
        return value

    wide = uns[metrics_key].pivot_table(
        index=[],
        columns="Metric Name",
        values="Metric Value",
        aggfunc="first",
    )
    wide = wide.reset_index(drop=True)
    wide.columns.name = None

    # Remove thousands separator and convert to numeric
    wide = wide.map(_strip_thousands)

    # Replace spaces with underscores in column names
    wide.columns = wide.columns.str.replace(" ", "_")

    for column in wide.columns:
        wide[column] = pd.to_numeric(wide[column], errors="coerce")

    wide["sample_id"] = [sample_id[0]]
    return wide
def format_cellbender_columns(mod_obs):
    """Select the configured cellbender columns that exist and make them float.

    Args:
        mod_obs: per-cell observation DataFrame; converted columns are
            written back into it.

    Returns:
        Tuple ``(cellbender_obs_keys, mod_obs)`` where ``cellbender_obs_keys``
        lists the configured cellbender columns found in ``mod_obs``. The
        list is empty when no cellbender columns are configured.

    Raises:
        ValueError: if a cellbender column cannot be converted to float.
    """
    # Treat a missing/empty setting as "no cellbender columns" so the key
    # list below is always defined, whatever the configuration.
    requested = par["obs_cellbender"] or []

    # Check if cellbender was run on the dataset
    if requested:
        check_optional_obs_keys(mod_obs, requested, "Run cellbender first to include these metrics.")

    cellbender_obs_keys = [column for column in requested if column in mod_obs]
    for key in cellbender_obs_keys:
        if not pd.api.types.is_float_dtype(mod_obs[key]):
            try:
                # NOTE(review): float16 cannot represent values above 65504;
                # confirm that is acceptable for non-float cellbender columns.
                mod_obs[key] = mod_obs[key].astype("float16")
            except ValueError as err:
                # Keep the original error as the cause for easier debugging.
                raise ValueError(f"Could not convert column {key} to a float dtype. Please make sure all cellbender metrics are numeric.") from err
    return cellbender_obs_keys, mod_obs
def format_required_columns(required_keys, mod_obs):
    """Validate the required QC columns and coerce them to their expected dtypes.

    All `required_keys` must already be numeric. The two count columns are
    cast to integer and the two fraction columns to float16, but only when
    they do not already have an integer / float dtype.

    Raises:
        ValueError: if a required column is not numeric.
    """
    for key in required_keys:
        if pd.api.types.is_numeric_dtype(mod_obs[key]):
            continue
        raise ValueError(f"Column {key} must be a numeric dtype.")

    # (column, "already fine" predicate, target dtype, label used in the log)
    conversions = (
        (par["obs_total_counts"], pd.api.types.is_integer_dtype, int, "integer"),
        (par["obs_num_nonzero_vars"], pd.api.types.is_integer_dtype, int, "integer"),
        (par["obs_fraction_mitochondrial"], pd.api.types.is_float_dtype, "float16", "float"),
        (par["obs_fraction_ribosomal"], pd.api.types.is_float_dtype, "float16", "float"),
    )
    for column, has_expected_dtype, target, label in conversions:
        series = mod_obs[column]
        if has_expected_dtype(series):
            continue
        logger.info(f"Converting {column} from {series.dtype} to {label} dtype...")
        mod_obs[column] = series.astype(target)

    return mod_obs
def format_categorical_columns(mod_obs):
    """Pick the metadata columns used for grouping and make them categorical.

    When ``par["obs_metadata"]`` is empty, every object/category column except
    the sample id column is used; otherwise the requested columns that exist
    in `mod_obs` are used (missing ones are only logged).

    Every selected column is re-encoded through ``str`` into a categorical,
    so categories are always strings and missing values become a regular
    category. This is what the function has always produced and is kept
    unchanged.

    Returns:
        Tuple ``(metadata_obs_keys, mod_obs)``.
    """
    # Fetch all categorical columns for grouping if no columns are provided
    if not par["obs_metadata"]:
        metadata_obs_keys = mod_obs.select_dtypes(include=["object", "category"]).columns.tolist()
        if par["obs_sample_id"] in metadata_obs_keys:
            metadata_obs_keys.remove(par["obs_sample_id"])
    else:
        check_optional_obs_keys(mod_obs, par["obs_metadata"], "Make sure requested metadata colmuns are present in obs.")
        metadata_obs_keys = [key for key in par["obs_metadata"] if key in mod_obs]

    for key in metadata_obs_keys:
        # Inspect the column's dtype, not the column name: the previous check
        # tested the name (a str) and therefore logged this message for every
        # column, including ones that already were categorical.
        if not isinstance(mod_obs[key].dtype, pd.CategoricalDtype):
            logger.info(f"{key} is not a categorical dtype. Converting {key} from {mod_obs[key].dtype} to categorical dtype...")
        mod_obs[key] = mod_obs[key].astype(str).astype("category")

    return metadata_obs_keys, mod_obs
def generate_cellranger_stats(mod_obs, uns, sample_id, required_keys):
    """Assemble the per-cell table and the Cell Ranger metrics row for one sample.

    Returns:
        Tuple ``(cell_rna_stats, cellranger_stats)``: a DataFrame with the
        sample id, required QC columns, cellbender columns and metadata
        columns, and the single-row Cell Ranger metrics DataFrame.
    """
    # Bring the three column groups into their expected dtypes.
    mod_obs = format_required_columns(required_keys, mod_obs)
    metadata_obs_keys, mod_obs = format_categorical_columns(mod_obs)
    cellbender_obs_keys, mod_obs = format_cellbender_columns(mod_obs)

    # Column order: sample id, required, cellbender, metadata.
    per_cell_columns = {"sample_id": pd.Categorical(sample_id)}
    for column in [*required_keys, *cellbender_obs_keys, *metadata_obs_keys]:
        per_cell_columns[column] = mod_obs[column]
    cell_rna_stats = pd.DataFrame(per_cell_columns)

    return cell_rna_stats, transform_cellranger_metrics(uns, sample_id)
def format_xenium_columns(mod_obs):
    """Add the nucleus/cell area ratio and cast the Xenium columns to float16.

    Args:
        mod_obs: per-cell observation DataFrame containing the nucleus area
            and cell area columns named in ``par`` plus ``x_coord`` and
            ``y_coord``.

    Returns:
        Tuple ``(mod_obs, xenium_formatted_columns)`` where the second item
        lists the columns that were cast.
    """
    cell_area = mod_obs[par["obs_cell_area"]]
    # A zero cell area would give an infinite ratio, which cannot be written
    # as valid JSON later on; masking zeros turns those ratios into NaN.
    mod_obs["nucleus_ratio"] = mod_obs[par["obs_nucleus_area"]] / cell_area.where(cell_area != 0)

    xenium_formatted_columns = [par["obs_cell_area"], "nucleus_ratio", "x_coord", "y_coord"]
    for key in xenium_formatted_columns:
        # NOTE(review): float16 keeps roughly 3 significant digits, so large
        # coordinates are coarsely rounded - confirm this is intended.
        mod_obs[key] = mod_obs[key].astype("float16")

    return mod_obs, xenium_formatted_columns
def generate_xenium_stats(mod_obs, sample_id, required_keys):
    """Assemble the per-cell table for one Xenium sample.

    Returns:
        DataFrame with the sample id, the required QC columns, the
        Xenium-specific columns and the metadata columns, in that order.
    """
    # Required columns first, then the Xenium-specific ones, then metadata.
    mod_obs = format_required_columns(required_keys, mod_obs)
    mod_obs, xenium_formatted_columns = format_xenium_columns(mod_obs)
    metadata_obs_keys, mod_obs = format_categorical_columns(mod_obs)

    per_cell_columns = {"sample_id": pd.Categorical(sample_id)}
    for column in [*required_keys, *xenium_formatted_columns, *metadata_obs_keys]:
        per_cell_columns[column] = mod_obs[column]

    return pd.DataFrame(per_cell_columns)
def concatenate_dataframes(dfs):
    '''Stack the given dataframes row-wise while keeping categorical columns categorical.

    The frames are concatenated with a fresh index. Any column that is
    categorical in at least one input frame is rebuilt as a categorical whose
    categories are the union of the categories of those input columns.
    '''
    combined = pd.concat(dfs, ignore_index=True)

    for column in combined.columns:
        # Collect this column from every input frame where it is categorical.
        categorical_parts = []
        for frame in dfs:
            if column in frame.columns and frame[column].dtype.name == 'category':
                categorical_parts.append(frame[column])
        if not categorical_parts:
            continue

        # Union the categories and apply them to the combined column.
        merged = pd.api.types.union_categoricals(categorical_parts)
        combined[column] = pd.Categorical(combined[column], categories=merged.categories)

    return combined
def main(par):
    """Build the QC data JSON and copy the matching report-structure JSON.

    For every input h5mu file the obs/obsm tables of the configured modality
    and the global uns are read, cells are pre-filtered on the configured
    count thresholds, and per-cell and per-sample statistics are collected.
    The statistics of all samples are combined and written to
    ``par["output"]``; the report structure for the ingestion method is
    copied to ``par["output_reporting_json"]``.

    Args:
        par: component arguments (see the VIASH block at the top of the file).

    Raises:
        ValueError: if a required obs column is missing, or if no cells of a
            sample pass the pre-filter.
    """
    cell_stats_dfs = []
    sample_stats_dfs = []
    metrics_cellranger_dfs = []

    is_cellranger = par["ingestion_method"] == "cellranger_multi"
    is_xenium = par["ingestion_method"] == "xenium"

    required_keys = [
        par["obs_total_counts"],
        par["obs_num_nonzero_vars"],
        par["obs_fraction_mitochondrial"],
        par["obs_fraction_ribosomal"],
    ]

    for i, mudata_file in enumerate(par["input"]):
        logger.info(f"Processing {mudata_file}")

        # Read the necessary info; the context manager closes the h5mu file
        # even when reading one of the elements fails.
        with h5py.File(mudata_file, "r") as file:
            grp_mod = file["mod"][par["modality"]]
            mod_obs = ad.experimental.read_elem(grp_mod["obs"])
            mod_obsm = ad.experimental.read_elem(grp_mod["obsm"])
            uns = ad.experimental.read_elem(file["uns"])

        barcodes_original_count = mod_obs.shape[0]

        # Add coordinates to obs before filtering
        if is_xenium:
            mod_obs["x_coord"] = mod_obsm["spatial"][:, 0]
            mod_obs["y_coord"] = mod_obsm["spatial"][:, 1]

        # Validate before the columns are used, so a wrong obs key gives a
        # clear error instead of a KeyError during filtering.
        missing_keys = [key for key in required_keys if key not in mod_obs.columns]
        if missing_keys:
            raise ValueError(f"Missing keys in obs: {', '.join(missing_keys)}")

        # Pre-filter cells on the configured columns; a threshold that is
        # absent or None disables that filter.
        logger.info("Pre-filtering cells based on counts...")
        if par.get("min_total_counts") is not None:
            mod_obs = mod_obs[mod_obs[par["obs_total_counts"]] >= par["min_total_counts"]]
        if par.get("min_num_nonzero_vars") is not None:
            mod_obs = mod_obs[mod_obs[par["obs_num_nonzero_vars"]] >= par["min_num_nonzero_vars"]]
        # Work on an independent copy: the helpers below assign columns.
        mod_obs = mod_obs.copy()
        barcodes_filtered_count = mod_obs.shape[0]
        if barcodes_filtered_count == 0:
            raise ValueError(
                f"No cells left in {mudata_file} after pre-filtering. "
                "Lower --min_total_counts and/or --min_num_nonzero_vars."
            )

        # Detect sample id's
        logger.info("Detecting sample id's...")
        sample_id = (
            mod_obs[par["obs_sample_id"]].tolist()
            if par["obs_sample_id"] in mod_obs.columns
            else [f"sample_{i}"] * mod_obs.shape[0]
        )

        # Generating sample summary statistics
        logger.info("Generating sample summary statistics...")
        total_counts = mod_obs[par["obs_total_counts"]]
        num_nonzero_vars = mod_obs[par["obs_num_nonzero_vars"]]
        sample_summary = {
            "sample_id": pd.Categorical([sample_id[0]]),
            "rna_num_barcodes": [barcodes_original_count],
            "rna_num_barcodes_filtered": [barcodes_filtered_count],
            "rna_sum_total_counts": [total_counts.sum()],
            "rna_median_total_counts": [total_counts.median()],
            "rna_overall_num_nonzero_vars": [num_nonzero_vars.sum()],
            "rna_median_num_nonzero_vars": [num_nonzero_vars.median()],
        }

        if is_xenium:
            sample_summary["control_probe_percentage"] = mod_obs[par["obs_control_probe_counts"]].sum() / total_counts.sum() * 100
            sample_summary["negative_decoding_percentage"] = mod_obs[par["obs_control_codeword_counts"]].sum() / total_counts.sum() * 100

        sample_summary_stats = pd.DataFrame(sample_summary)

        if is_cellranger:
            cell_rna_stats, cellranger_stats = generate_cellranger_stats(mod_obs, uns, sample_id, required_keys)
            metrics_cellranger_dfs.append(cellranger_stats)

        if is_xenium:
            cell_rna_stats = generate_xenium_stats(mod_obs, sample_id, required_keys)

        cell_stats_dfs.append(cell_rna_stats)
        sample_stats_dfs.append(sample_summary_stats)

    # Combine dataframes of all samples
    logger.info("Combining data of all samples into single object...")
    combined_cell_stats = concatenate_dataframes(cell_stats_dfs)
    combined_sample_stats = concatenate_dataframes(sample_stats_dfs)

    report_categories = [combined_cell_stats, combined_sample_stats]
    if is_cellranger:
        combined_metrics_cellranger = concatenate_dataframes(metrics_cellranger_dfs)
        report_categories.append(combined_metrics_cellranger)

    for df in report_categories:
        df["sample_id"] = pd.Categorical(df["sample_id"])

    output = {
        "cell_rna_stats": transform_df(combined_cell_stats),
        "sample_summary_stats": transform_df(combined_sample_stats),
    }
    if is_cellranger:
        output["metrics_cellranger_stats"] = transform_df(combined_metrics_cellranger)

    logger.info(f"Writing output data json to {par['output']}")
    output_path = Path(par["output"])
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)

    report_structures = {
        "cellranger_multi": os.path.join(meta["resources_dir"], "report_structure/cellranger.json"),
        "xenium": os.path.join(meta["resources_dir"], "report_structure/xenium.json"),
    }
    logger.info(f"Writing output report structure json to {par['output_reporting_json']}")
    shutil.copy(report_structures[par["ingestion_method"]], par["output_reporting_json"])


if __name__ == "__main__":
    main(par)

View File

@@ -0,0 +1,144 @@
import pytest
import os
import json
import sys
import numpy as np
## VIASH START
# Development-time defaults so the tests can be run outside of viash.
# NOTE(review): presumably replaced by viash-generated values at test time - confirm.
# `resources_dir` is the prefix the tests prepend to their input paths.
meta = {
"resources_dir": "resources_test",
"executable": "./target/executable/ingestion_qc/h5mu_to_qc_json/h5mu_to_qc_json"
}
## VIASH END
def test_cellranger_execution(run_component, tmp_path):
    """Run the component on two CellRanger multi samples with default options.

    Checks that both output files are written, that the data JSON has the
    three expected categories, that all expected per-cell columns are present
    and that every category follows the annotation object layout.
    """
    output_json_path = tmp_path / "output.json"
    output_reporting_json_path = tmp_path / "output_reporting.json"

    run_component(
        [
            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
            "--ingestion_method", "cellranger_multi",
            "--output", output_json_path,
            "--output_reporting_json", output_reporting_json_path
        ]
    )

    assert os.path.exists(output_json_path), "Output file was not created"
    # The report structure is the component's second output; verify it too.
    assert os.path.exists(output_reporting_json_path), "Output reporting json file was not created"

    with open(output_json_path, "r") as f:
        output_json_dict = json.load(f)

    assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}

    column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
    expected_column_names = [
        "sample_id", "total_counts", "num_nonzero_vars",
        "fraction_mitochondrial", "fraction_ribosomal",
        "cellbender_background_fraction", "cellbender_cell_probability",
        "cellbender_cell_size", "cellbender_droplet_efficiency",
        "donor_id", "cell_type", "batch", "condition"
    ]
    # Report exactly which columns are missing instead of a bare False.
    missing_columns = [column for column in expected_column_names if column not in column_names_cell]
    assert not missing_columns, f"Missing columns in cell_rna_stats: {missing_columns}"

    for key in output_json_dict.keys():
        assert output_json_dict[key].keys() == {"num_rows", "num_cols", "min_total_counts", "min_num_nonzero_vars", "columns"}
        for col in output_json_dict[key]["columns"]:
            assert {"name", "dtype", "data"}.issubset(col.keys())
def test_set_filters(run_component, tmp_path):
    """Run the component with explicit obs keys, thresholds and metadata columns.

    Checks that only the requested metadata column is exported and that the
    count thresholds were applied to the exported cells.
    """
    output_json_path = tmp_path / "output.json"
    output_reporting_json_path = tmp_path / "output_reporting.json"

    run_component(
        [
            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
            "--ingestion_method", "cellranger_multi",
            "--output", output_json_path,
            "--output_reporting_json", output_reporting_json_path,
            "--obs_sample_id", "sample_id",
            "--obs_total_counts", "total_counts",
            "--obs_num_nonzero_vars", "num_nonzero_vars",
            "--obs_fraction_mitochondrial", "fraction_mitochondrial",
            "--obs_fraction_ribosomal", "fraction_ribosomal",
            "--min_total_counts", "20",
            "--min_num_nonzero_vars", "20",
            "--obs_metadata", "cell_type"
        ]
    )

    assert os.path.exists(output_json_path), "Output file was not created"
    # The report structure is the component's second output; verify it too.
    assert os.path.exists(output_reporting_json_path), "Output reporting json file was not created"

    with open(output_json_path, "r") as f:
        output_json_dict = json.load(f)

    assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}

    column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
    expected_column_names = [
        "sample_id", "total_counts", "num_nonzero_vars",
        "fraction_mitochondrial", "fraction_ribosomal",
        "cellbender_background_fraction", "cellbender_cell_probability",
        "cellbender_cell_size", "cellbender_droplet_efficiency",
        "cell_type"
    ]
    unexpected_column_names = ["batch", "condition", "donor_id"]

    # Report exactly which columns are wrong instead of a bare False.
    missing_columns = [column for column in expected_column_names if column not in column_names]
    assert not missing_columns, f"Missing columns in cell_rna_stats: {missing_columns}"
    leaked_columns = [column for column in unexpected_column_names if column in column_names]
    assert not leaked_columns, f"Unexpected columns in cell_rna_stats: {leaked_columns}"

    for key in output_json_dict.keys():
        assert output_json_dict[key].keys() == {"num_rows", "num_cols", "min_total_counts", "min_num_nonzero_vars", "columns"}
        for col in output_json_dict[key]["columns"]:
            assert {"name", "dtype", "data"}.issubset(col.keys())

    total_counts = next(col for col in output_json_dict["cell_rna_stats"]["columns"] if col["name"] == "total_counts")
    assert min(total_counts["data"]) >= 20

    num_nonzero_vars = next(col for col in output_json_dict["cell_rna_stats"]["columns"] if col["name"] == "num_nonzero_vars")
    assert min(num_nonzero_vars["data"]) >= 20
def test_xenium_execution(run_component, tmp_path):
    """Run the component on Xenium data (the same tiny sample passed twice).

    Checks that both output files are written, that no Cell Ranger metrics
    category is produced and that the Xenium-specific columns are exported.
    """
    output_json_path = tmp_path / "output.json"
    output_reporting_json_path = tmp_path / "output_reporting.json"

    run_component(
        [
            "--input", meta["resources_dir"] + "/resources_test/spatial_qc_sample_data/xenium_tiny.qc.h5mu",
            "--input", meta["resources_dir"] + "/resources_test/spatial_qc_sample_data/xenium_tiny.qc.h5mu",
            "--ingestion_method", "xenium",
            "--min_num_nonzero_vars", "1",
            "--output", output_json_path,
            "--output_reporting_json", output_reporting_json_path
        ]
    )

    assert os.path.exists(output_json_path), "Output file was not created"
    # The report structure is the component's second output; verify it too.
    assert os.path.exists(output_reporting_json_path), "Output reporting json file was not created"

    with open(output_json_path, "r") as f:
        output_json_dict = json.load(f)

    assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats"}
    assert "metrics_cellranger_stats" not in output_json_dict.keys()

    column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
    expected_column_names = [
        "sample_id", "total_counts", "num_nonzero_vars",
        "fraction_mitochondrial", "fraction_ribosomal",
        "cell_area", "nucleus_ratio",
        "x_coord", "y_coord", "cell_id", "segmentation_method", "region"
    ]
    # Report exactly which columns are missing instead of a bare False.
    missing_columns = [column for column in expected_column_names if column not in column_names_cell]
    assert not missing_columns, f"Missing columns in cell_rna_stats: {missing_columns}"

    for key in output_json_dict.keys():
        assert output_json_dict[key].keys() == {"num_rows", "num_cols", "min_total_counts", "min_num_nonzero_vars", "columns"}
        for col in output_json_dict[key]["columns"]:
            assert {"name", "dtype", "data"}.issubset(col.keys())


if __name__ == "__main__":
    sys.exit(pytest.main([__file__]))

12
src/utils/setup_logger.py Normal file
View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level messages to stdout.

    The function is safe to call more than once: the console handler is only
    attached the first time, so repeated calls do not duplicate log lines.

    Returns:
        The root ``logging.Logger``.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Handlers added here are tagged so a later call can recognise them;
    # comparing streams would break when stdout is swapped (e.g. by pytest).
    already_attached = any(
        getattr(handler, "_qc_console_handler", False) for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        console_handler._qc_console_handler = True
        logger.addHandler(console_handler)

    return logger

View File

@@ -0,0 +1,147 @@
name: generate_qc_report
namespace: workflows
description: Run the ingestion QC report generation
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [author]
- __merge__: /src/authors/dorien_roosen.yaml
roles: [author]
- __merge__: /src/authors/robrecht_cannoodt.yaml
roles: [author]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [contributor]
argument_groups:
# TO DO: it would be nice if the sample metadata was already
# included in the h5mu files, so that we don't need to pass it.
- name: Inputs
arguments:
- name: --id
type: string
required: false
direction: input
description: |
The sample IDs to include in the report. If not provided,
the sample IDs will be extracted from the h5mu files.
example: sample1
- name: --input
type: file
required: true
direction: input
description: The input h5mu files.
example: path/to/file1.h5mu
- name: --ingestion_method
type: string
required: true
choices:
- cellranger_multi
- xenium
- name: --sample_metadata
type: file
required: false
direction: input
description: |
The sample metadata file corresponding to .obs fields in the h5mu input files, to be used for grouping in the report.
example: path/to/file.csv
- name: --max_samples_per_report
type: integer
default: 20
description: |
The maximum number of samples to be included per report.
Multiple reports will be generated (with samples equally divided over all reports) if number of input samples exceeds this threshold.
- name: Options
arguments:
- name: "--var_gene_names"
example: "gene_symbol"
type: string
description: |
The column name in the .var h5mu files that contains the gene names. If not provided, .var_names will be used.
- name: --obs_metadata
type: string
multiple: true
description: The metadata keys in the h5mu .obs to include in the report.
example: [donor_id, cell_type, batch, condition]
- name: QC options
arguments:
- name: "--var_name_mitochondrial_genes"
type: string
required: false
default: "mitochondrial"
description: |
In which .var slot to store a boolean array corresponding to the mitochondrial genes.
- name: "--var_name_ribosomal_genes"
type: string
required: false
default: "ribosomal"
description: |
In which .var slot to store a boolean array corresponding to the ribosomal genes.
- name: --min_total_counts
type: integer
description: |
Minimum total counts for a cell to be included in the output.
default: 10
min: 1
- name: --min_num_nonzero_vars
type: integer
description: |
Minimum number of nonzero vars for a cell to be included in the output.
default: 10
min: 1
- name: Cellbender options
arguments:
- name: "--run_cellbender"
type: boolean
required: false
description: Whether to run cellbender or not.
default: false
- name: "--cellbender_epochs"
type: integer
required: false
description: Number of epochs to train cellbender.
default: 150
- name: Outputs
arguments:
- name: --output_qc_report
type: file
required: true
multiple: true
direction: output
description: The output HTML report
example: path/to/file.html
- name: --output_processed_h5mu
type: file
required: true
direction: output
description: Folder containing the processed h5mu files.
default: qc_h5mu
resources:
- type: nextflow_script
entrypoint: run_wf
path: main.nf
test_resources:
- type: nextflow_script
path: test.nf
entrypoint: test_no_cellbender
- type: nextflow_script
path: test.nf
entrypoint: test_with_cellbender
dependencies:
- name: metadata/add_id
repository: openpipeline
- name: workflows/qc/qc
repository: openpipeline
- name: correction/cellbender_remove_background
alias: cellbender
repository: openpipeline
- name: ingestion_qc/h5mu_to_qc_json
- name: ingestion_qc/generate_html
- name: move_files_to_directory
repository: craftbox
runners:
- type: nextflow

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env bash

# Integration tests for the generate_qc_report workflow: builds the workflow
# and runs every test entrypoint of test.nf in turn.

# Stop at the first failing command so that a failed build or an early test
# failure is not hidden by the exit code of a later command.
set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

viash ns build --setup cb -q generate_qc_report

# Each entry is a test workflow defined in test.nf.
for entry in test_no_cellbender test_xenium test_with_cellbender test_multiple_reports; do
  nextflow run . \
    -main-script src/workflows/generate_qc_report/test.nf \
    -profile docker,no_publish,local \
    -entry "$entry" \
    -c src/configs/labels_ci.config \
    -resume
done

View File

@@ -0,0 +1,208 @@
workflow run_wf {
take: input_ch
main:
qc_ch = input_ch
  // store join id
  | map { id, state ->
    [id, state + [_meta: [join_id: id]]]
  }
  // add sample ids to each state
  | add_id.run(
    fromState: [
      input_id: "id",
      input: "input"
    ],
    args: [
      obs_output: "sample_id"
    ],
    toState: [ "input": "output" ]
  )
  // run cellbender (only for states that request it)
  | cellbender.run(
    runIf: {id, state -> state.run_cellbender},
    fromState: [
      id: "id",
      input: "input",
      epochs: "cellbender_epochs",
    ],
    args: [
      obs_background_fraction: "cellbender_background_fraction",
      obs_cell_probability: "cellbender_cell_probability",
      obs_droplet_efficiency: "cellbender_droplet_efficiency",
      obs_cell_size: "cellbender_cell_size",
    ],
    toState: ["input": "output"]
  )
  // run qc on each sample
  // NOTE(review): the mitochondrial/ribosomal var names are fixed here rather
  // than read from the state - confirm that this is intended.
  | qc.run(
    fromState: [
      id: "id",
      input: "input",
      var_gene_names: "var_gene_names"
    ],
    args: [
      var_name_mitochondrial_genes: "mitochondrial",
      var_name_ribosomal_genes: "ribosomal",
      output_obs_num_nonzero_vars: "num_nonzero_vars",
      output_obs_total_counts_vars: "total_counts"
    ],
    toState: { id, output, state ->
      // drop the arguments that are no longer needed downstream
      def keysToRemove = ["var_gene_names", "var_name_mitochondrial_genes", "var_name_ribosomal_genes", "run_cellbender", "cellbender_epochs"]
      def newState = state.findAll{it.key !in keysToRemove}
      newState + ["input": output.output]
    }
  )
  // merge the per-sample states into a single "qc_data" state
  | joinStates { ids, states ->
    def newId = "qc_data"
    // gather keys with unique values across states that should be combined
    def new_state_non_unique_values = [
      input: states.collect{it.input},
      join_ids: states.collect{it._meta.join_id},
      _meta: [join_id: ids[0]]
    ]
    // gather keys from different states
    def all_state_keys = states.inject([].toSet()){ current_keys, state ->
      def new_keys = current_keys + state.keySet()
      return new_keys
    }.minus(["output", "id", "input", "_meta"])
    // Create the new state from the keys, values should be the same across samples
    def new_state = all_state_keys.inject([:]){ old_state, argument_name ->
      // `def` keeps this variable local to the closure; without it the value
      // would be stored in the shared script binding.
      def argument_values = states.collect{it.get(argument_name)}.unique()
      assert argument_values.size() == 1, "Arguments should be the same across samples. Argument name: $argument_name, argument value: $argument_values"
      // take the unique value from the list (there is only one)
      def argument_value = argument_values[0]
      def current_state = old_state + [(argument_name): argument_value]
      return current_state
    }
    def data_state = new_state_non_unique_values + new_state
    [ newId, data_state ]
  }
// Branch 1 of qc_ch: collect the processed h5mu files into one output folder.
processed_files_ch = qc_ch
// move all processed h5mu files to the same folder
| move_files_to_directory.run(
fromState: [
input: "input",
output: "output_processed_h5mu"
],
toState: [ "output_processed_h5mu": "output" ]
)
// keep only the folder in the state; everything else is dropped
| setState(["output_processed_h5mu"])
// Branch 2 of qc_ch: split the samples over one or more reports, generate the
// QC json and HTML for each report, then gather all reports in one state.
report_ch = qc_ch
// group the processed samples to generate one or multiple reports
| flatMap { id, state ->
// calculate number of reports to be generated and number of samples per report
def totalInputs = state.input.size()
def maxSamplesPerGroup = state.max_samples_per_report
def numGroups = Math.max(1, Math.ceil(totalInputs / maxSamplesPerGroup) as Integer)
// samples are spread evenly: every group gets the base amount and the first
// `remainder` groups get one extra sample
def baseSamplesPerGroup = totalInputs.intdiv(numGroups)
def remainder = totalInputs % numGroups
println "Dividing ${totalInputs} sample(s) over ${numGroups} report(s) (max ${maxSamplesPerGroup} per report)"
// sort inputs to make grouping deterministic
def inputs = []
for (int i = 0; i < state.input.size(); i++) {
inputs << [input: state.input[i], _meta: [join_id: state.join_ids[i]]]
}
def sortedInputs = inputs.sort { it._meta.join_id }
def groups = []
def itemIndex = 0
// create one channel per report
(0..<numGroups).each { groupNum ->
def samplesInGroup = baseSamplesPerGroup + (groupNum < remainder ? 1 : 0)
def groupItems = sortedInputs[itemIndex..<(itemIndex + samplesInGroup)]
def newId = "combined_${groupNum + 1}_of_${numGroups}"
def newState = state.clone() // Copy all the original state
// Override the input and _meta with the grouped items
newState.input = groupItems.collect { it.input }
// NOTE(review): assumes every group holds at least one sample; with zero
// inputs groupItems[0] would be null - confirm inputs are never empty.
newState._meta = groupItems[0]._meta
println "Group ${groupNum + 1}: ${samplesInGroup} samples - ${newState._meta}"
groups << [newId, newState]
itemIndex += samplesInGroup
}
return groups
}
// Set aside output for QC report instructions
| map { id, state ->
def new_state = state + ["output_reporting_json": "reporting_json.json"]
[id, new_state]
}
// generate qc json
| h5mu_to_qc_json.run(
fromState: [
input: "input",
ingestion_method: "ingestion_method",
obs_metadata: "obs_metadata",
min_total_counts: "min_total_counts",
min_num_nonzero_vars: "min_num_nonzero_vars"
],
// obs column names written by the add_id and qc steps of qc_ch
args: [
obs_sample_id: "sample_id",
obs_total_counts: "total_counts",
obs_num_nonzero_vars: "num_nonzero_vars",
obs_fraction_mitochondrial: "fraction_mitochondrial",
obs_fraction_ribosomal: "fraction_ribosomal",
],
toState: [
output: "output",
output_reporting_json: "output_reporting_json"
]
)
// generate html report
| generate_html.run(
fromState: [
input_data: "output",
input_structure: "output_reporting_json"
],
toState: [
output_qc_report: "output_qc_report"
]
)
// collect the reports into a single channel
| joinStates { ids, states ->
def newId = "qc_report"
def report_state = [
output_qc_report: states.collect{it.output_qc_report},
_meta: states[0]._meta
]
[ newId, report_state ]
}
// Merge the report state and the processed-files state into the single
// "combined" state that the workflow emits.
output_ch = report_ch.mix(processed_files_ch)
| joinStates { ids, states ->
// exactly one state from each branch is expected
assert states.size() == 2, "Expected 2 states, but got ${states.size()}"
assert ids.contains('qc_report'), "Expected one channel to have the id `qc_report`, but got ${ids}"
assert ids.contains('qc_data'), "Expected one channel to have the id `qc_data`, but got ${ids}"
def newId = "combined"
// for keys present in both states the value of states[1] wins
def combined_state = states[0] + states [1]
[ newId, combined_state ]
}
emit: output_ch
}

View File

@@ -0,0 +1,10 @@
// Nextflow configuration for this workflow.
manifest {
// minimum Nextflow version constraint for this workflow
nextflowVersion = '!>=20.12.1-edge'
}
params {
// repository root, resolved three directories above the project directory
rootDir = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString()
}
// include common settings
includeConfig("${params.rootDir}/src/configs/labels.config")

View File

@@ -0,0 +1,224 @@
// Test entry points for the generate_qc_report workflow.
nextflow.enable.dsl=2
// Location of the built Nextflow workflows, relative to params.rootDir
// (params.rootDir is not set in this file and has to be provided by the config).
targetDir = params.rootDir + "/target/nextflow/workflows"
// Workflow under test, taken from the build output rather than from the sources
include { generate_qc_report } from targetDir + "/generate_qc_report/main.nf"
// Root location of the test resources used by the test workflows below
params.resources_test = "s3://openpipelines-bio/openpipeline_incubator/resources_test/"
workflow test_no_cellbender {
  // Feeds two CellRanger Multi samples through generate_qc_report with
  // CellBender switched off and validates the single combined output event.
  resources_test_file = file(params.resources_test)

  // sample id -> test resource, relative to the resources_test root
  def sample_files = [
    sample_1: "qc_sample_data/sample_one.qc.h5mu",
    sample_2: "qc_sample_data/sample_two.qc.h5mu"
  ]

  // all samples share the same settings; only id and input differ
  def input_states = sample_files.collect { sample_id, relative_path ->
    [
      id: sample_id,
      input: resources_test_file.resolve(relative_path),
      ingestion_method: "cellranger_multi",
      run_cellbender: false,
      var_gene_names: "gene_symbol",
      metadata_obs_keys: ["donor_id", "cell_type", "batch", "condition"],
      output_html: "report.html",
      publish_dir: "test_out"
    ]
  }

  output_ch = Channel.fromList(input_states)
    | map { state -> [state.id, state] }
    | generate_qc_report
    | view { output ->
      // the workflow is expected to emit [id, state] tuples
      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
      def (id, state) = output

      assert id == "combined": "Output ID should be `combined`"
      assert state instanceof Map : "State should be a map. Found: ${state}"
      assert state.containsKey("output_qc_report"): "Output should contain key `output_qc_report`"
      assert state.containsKey("output_processed_h5mu"): "Output should contain key `output_processed_h5mu`"

      // one report for both samples
      assert state.output_qc_report.size() == 1 : "Expected exactly one output HTML file to be generated"
      assert state.output_qc_report.every { it.isFile()} : "All output HTML report file should exist"

      // one processed file per input sample
      assert state.output_processed_h5mu.isDirectory() : "Output directory should exist"
      def files = state.output_processed_h5mu.listFiles().findAll { it.isFile() }
      assert files.size() == 2 : "Output directory should contain exactly 2 files, but found ${files.size()} files"

      "Output: $output"
    }
}
workflow test_xenium {
  // Feeds the same tiny Xenium dataset in twice (as two samples) and
  // validates the single combined output event of generate_qc_report.
  resources_test_file = file(params.resources_test)

  // both samples read the same test resource
  def xenium_file = resources_test_file.resolve("spatial_qc_sample_data/xenium_tiny.qc.h5mu")

  // all samples share the same settings; only the id differs
  def input_states = ["sample_one", "sample_two"].collect { sample_id ->
    [
      id: sample_id,
      input: xenium_file,
      ingestion_method: "xenium",
      run_cellbender: false,
      var_gene_names: "gene_ids",
      min_num_nonzero_vars: "1",
      output_html: "report.html",
      publish_dir: "test_out"
    ]
  }

  output_ch = Channel.fromList(input_states)
    | map { state -> [state.id, state] }
    | generate_qc_report
    | view { output ->
      // the workflow is expected to emit [id, state] tuples
      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
      def (id, state) = output

      assert id == "combined": "Output ID should be `combined`"
      assert state instanceof Map : "State should be a map. Found: ${state}"
      assert state.containsKey("output_qc_report"): "Output should contain key `output_qc_report`"
      assert state.containsKey("output_processed_h5mu"): "Output should contain key `output_processed_h5mu`"

      // one report for both samples
      assert state.output_qc_report.size() == 1 : "Expected exactly one output HTML file to be generated"
      assert state.output_qc_report.every { it.isFile()} : "All output HTML report file should exist"

      // one processed file per input sample
      assert state.output_processed_h5mu.isDirectory() : "Output directory should exist"
      def files = state.output_processed_h5mu.listFiles().findAll { it.isFile() }
      assert files.size() == 2 : "Output directory should contain exactly 2 files, but found ${files.size()} files"

      "Output: $output"
    }
}
workflow test_with_cellbender {
  // Feeds two CellRanger Multi samples through generate_qc_report with
  // CellBender enabled (a single epoch) and validates the combined output event.
  resources_test_file = file(params.resources_test)

  // sample id -> test resource, relative to the resources_test root
  def sample_files = [
    sample_one: "qc_sample_data/sample_one.qc.h5mu",
    sample_two: "qc_sample_data/sample_two.qc.h5mu"
  ]

  // all samples share the same settings; only id and input differ
  def input_states = sample_files.collect { sample_id, relative_path ->
    [
      id: sample_id,
      input: resources_test_file.resolve(relative_path),
      ingestion_method: "cellranger_multi",
      var_gene_names: "gene_symbol",
      run_cellbender: true,
      cellbender_epochs: 1,
      output_html: "report.html",
      publish_dir: "test_out"
    ]
  }

  output_ch = Channel.fromList(input_states)
    | map { state -> [state.id, state] }
    | generate_qc_report
    | view { output ->
      // the workflow is expected to emit [id, state] tuples
      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
      def (id, state) = output

      assert id == "combined": "Output ID should be `combined`"
      assert state instanceof Map : "State should be a map. Found: ${state}"
      assert state.containsKey("output_qc_report"): "Output should contain key `output_qc_report`"
      assert state.containsKey("output_processed_h5mu"): "Output should contain key `output_processed_h5mu`"

      // one report for both samples
      assert state.output_qc_report.size() == 1 : "Expected exactly one output HTML file to be generated"
      assert state.output_qc_report.every { it.isFile()} : "All output HTML report file should exist"

      // one processed file per input sample
      assert state.output_processed_h5mu.isDirectory() : "Output directory should exist"
      def files = state.output_processed_h5mu.listFiles().findAll { it.isFile() }
      assert files.size() == 2 : "Output directory should contain exactly 2 files, but found ${files.size()} files"

      "Output: $output"
    }
}
workflow test_multiple_reports {
  // Feeds five CellRanger Multi samples through generate_qc_report with
  // max_samples_per_report set to 2. The test expects the samples to be spread
  // over three reports while all five processed files end up in one directory.
  resources_test_file = file(params.resources_test)

  // sample id -> test resource, relative to the resources_test root
  // (the two available test files are reused to get to five samples)
  def sample_files = [
    sample_1: "qc_sample_data/sample_one.qc.h5mu",
    sample_2: "qc_sample_data/sample_two.qc.h5mu",
    sample_3: "qc_sample_data/sample_one.qc.h5mu",
    sample_4: "qc_sample_data/sample_two.qc.h5mu",
    sample_5: "qc_sample_data/sample_one.qc.h5mu"
  ]

  // all samples share the same settings; only id and input differ
  def input_states = sample_files.collect { sample_id, relative_path ->
    [
      id: sample_id,
      input: resources_test_file.resolve(relative_path),
      ingestion_method: "cellranger_multi",
      run_cellbender: false,
      metadata_obs_keys: ["donor_id", "cell_type", "batch", "condition"],
      output_html: "report.html",
      max_samples_per_report: 2,
      publish_dir: "test_out"
    ]
  }

  output_ch = Channel.fromList(input_states)
    | map { state -> [state.id, state] }
    | generate_qc_report
    | view { output ->
      // the workflow is expected to emit [id, state] tuples
      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
      def id = output[0]
      def state = output[1]

      assert id == "combined": "Output ID should be `combined`"
      assert state instanceof Map : "State should be a map. Found: ${state}"
      assert state.containsKey("output_qc_report"): "Output should contain key `output_qc_report`"
      assert state.containsKey("output_processed_h5mu"): "Output should contain key `output_processed_h5mu`"

      // 5 samples at 2 per report -> 3 reports; the message now matches the check
      assert state.output_qc_report.size() == 3 : "Expected exactly three output HTML files to be generated, but found ${state.output_qc_report.size()}"
      assert state.output_qc_report.every { it.isFile()} : "All output HTML report file should exist"

      // one processed file per input sample
      assert state.output_processed_h5mu.isDirectory() : "Output directory should exist"
      def files = state.output_processed_h5mu.listFiles().findAll { it.isFile() }
      assert files.size() == 5 : "Output directory should contain exactly 5 files, but found ${files.size()} files"

      "Output: $output"
    }
}

0
target/.build.yaml Normal file
View File

View File

@@ -0,0 +1,484 @@
name: "h5mu_to_qc_json"
namespace: "ingestion_qc"
version: "v0.1.0"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
- name: "Dorien Roosen"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Robrecht Cannoodt"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
description: "The input h5mu file(s)"
info: null
example:
- "path/to/file.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "The modality to use"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--ingestion_method"
description: "Method that was used to ingest the data - this will define the structure\
\ of the report that is generated."
info: null
required: true
choices:
- "cellranger_multi"
- "xenium"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_sample_id"
description: "The key in the h5mu file that contains the sample ID. If not provided,\
\ each H5MU file will be considered as a separate sample."
info: null
default:
- "sample_id"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_total_counts"
description: "The key in the h5mu .obs field that contains the total counts."
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_num_nonzero_vars"
description: "The key in the h5mu .obs field that contains the number of nonzero\
\ vars."
info: null
default:
- "num_nonzero_vars"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_fraction_mitochondrial"
description: "The key in the h5mu .obs field that contains the fraction mitochondrial\
\ genes."
info: null
default:
- "fraction_mitochondrial"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_fraction_ribosomal"
description: "The key in the h5mu .obs field that contains the fraction ribosomal\
\ genes."
info: null
default:
- "fraction_ribosomal"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
description: "The output JSON file"
info: null
example:
- "path/to/file.json"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output_reporting_json"
description: "The output JSON file that defines the QC report"
info: null
example:
- "path/to/file.json"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- name: "Filtering & grouping options"
arguments:
- type: "integer"
name: "--min_total_counts"
description: "Minimum total counts for a cell to be included in the output"
info: null
default:
- 10
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--min_num_nonzero_vars"
description: "Minimum number of nonzero vars for a cell to be included in the\
\ output"
info: null
default:
- 10
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_metadata"
description: "The metadata keys in the h5mu .obs to include in the output JSON."
info: null
example:
- "donor_id;cell_type;batch;condition"
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- name: "Options for CellRanger reports"
arguments:
- type: "string"
name: "--obs_cellbender"
description: "The cellbender keys in the h5mu .obs to include in the output JSON"
info: null
default:
- "cellbender_background_fraction"
- "cellbender_cell_probability"
- "cellbender_cell_size"
- "cellbender_droplet_efficiency"
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--uns_cellranger_metrics"
description: "The key in the h5mu file .uns that contains the cellranger metrics"
info: null
default:
- "metrics_cellranger"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Options for Xenium reports"
arguments:
- type: "string"
name: "--obs_nucleus_area"
description: "The key in the h5mu .obs field that contains the nucleus area."
info: null
default:
- "nucleus_area"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_cell_area"
description: "The key in the h5mu .obs field that contains the cell area."
info: null
default:
- "cell_area"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_x_coord"
description: "The key in the h5mu .obs field that contains the x coordinate."
info: null
default:
- "x_coord"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_y_coord"
description: "The key in the h5mu .obs field that contains the y coordinate."
info: null
default:
- "y_coord"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_control_probe_counts"
description: "The key in the h5mu .obs field that contains the number of control\
\ probes."
info: null
default:
- "control_probe_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_control_codeword_counts"
description: "The key in the h5mu .obs field that contains the number of control\
\ codewords."
info: null
default:
- "control_codeword_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "report_structure"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Takes H5MU files that have been ingested by CellRanger or Xenium\
\ and processed by the QC workflow, and generates:\n- A JSON file that contains\
\ the combined data for the QC report\n- A JSON file that defines the layout and\
\ structure of the QC report\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "resources_test"
info: null
status: "enabled"
scope:
image: "private"
target: "private"
requirements:
commands:
- "ps"
repositories:
- type: "github"
name: "openpipeline"
repo: "openpipelines-bio/openpipeline"
tag: "2.1.2"
- type: "vsh"
name: "craftbox"
repo: "craftbox"
tag: "v0.2.0"
links:
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midmem"
- "middisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "v0.1.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/_private/executable/ingestion_qc/h5mu_to_qc_json"
executable: "target/_private/executable/ingestion_qc/h5mu_to_qc_json/h5mu_to_qc_json"
viash_version: "0.9.4"
git_commit: "4de00a2614069bdaee27943e73a51d378e465c60"
git_remote: "https://github.com/openpipelines-bio/openpipeline_qc"
git_tag: "v0.1.0"
package_config:
name: "openpipeline_qc"
version: "v0.1.0"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-bio/openpipeline_incubator/resources_test"
dest: "resources_test"
repositories:
- type: "github"
name: "openpipeline"
repo: "openpipelines-bio/openpipeline"
tag: "2.1.2"
- type: "vsh"
name: "craftbox"
repo: "craftbox"
tag: "v0.2.0"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].directives.tag\
\ := '$id'\n.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'v0.1.0'"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
docker_registry: "ghcr.io"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,66 @@
process {
  // Baseline resources for components that hardly do any processing
  cpus = 1
  memory = { 2.GB * task.attempt }

  // Retry for exit codes that have something to do with memory issues,
  // stop the run for anything else
  maxRetries = 3
  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }

  // Optional upper bound read by get_memory(); unset by default
  maxMemory = null

  // CPU labels
  withLabel: singlecpu { cpus = 1 }
  withLabel: lowcpu { cpus = 4 }
  withLabel: midcpu { cpus = 10 }
  withLabel: highcpu { cpus = 20 }

  // Memory labels: the request grows with every attempt and is passed
  // through get_memory()
  withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
  withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
  withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
  withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }

  // Disk labels: fall back to the process-wide `disk` setting, if there is one
  withLabel: lowdisk {
    disk = { process.disk ?: null }
  }
  withLabel: middisk {
    disk = { process.disk ?: null }
  }
  withLabel: highdisk {
    disk = { process.disk ?: null }
  }
  withLabel: veryhighdisk {
    disk = { process.disk ?: null }
  }

  // NOTE: The disk labels intentionally do not have an effect by default.
  // The user should set the disk space requirements by adding the following
  // to the compute environment:
  //
  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
  // withLabel: middisk { disk = { 100.GB * task.attempt } }
  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Limit a requested amount of memory to `process.maxMemory`.
//
// to_compare: the requested memory (the memory labels pass `<n>.GB * task.attempt`).
//
// Returns:
//   - `to_compare` unchanged when `process.maxMemory` is not set;
//   - `process.maxMemory` on the attempt whose number equals `process.maxRetries`;
//   - `process.maxMemory` (as a MemoryUnit) when the request exceeds it;
//   - `to_compare` otherwise.
//
// Any error while evaluating the settings prints a message and exits with code 1.
//
// NOTE(review): `task` is not a parameter of this function; presumably it is
// resolved from the directive closure that calls get_memory() - confirm.
def get_memory(to_compare) {
  // no cap configured: hand the request back untouched
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo() only guarantees the sign of its result, so test for > 0
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // was `max_memory`, an undefined name that always ended up in the catch block
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,162 @@
{
"categories": [
{
"name": "Sample QC",
"key": "sample_summary_stats",
"additionalAxes": false,
"defaultFilters": []
},
{
"name": "SampleQC",
"key": "metrics_cellranger_stats",
"additionalAxes": false,
"defaultFilters": [
{
"type": "bar",
"field": "Number_of_reads_in_the_library",
"label": "Number of reads per library",
"description": "Sequencing depth per sample. Higher values generally indicate more comprehensive cell profiling.",
"nBins": 10,
"groupBy": "sample_id",
"xAxisType": "linear",
"yAxisType": "linear"
},
{
"type": "bar",
"field": "Confidently_mapped_reads_in_cells",
"label": "Confidently mapped reads in cells",
"description": "Number of reads that were mapped unambiguously to the reference genome within cell-containing droplets.",
"groupBy": "sample_id",
"nBins": 10,
"yAxisType": "linear"
},
{
"type": "bar",
"field": "Estimated_number_of_cells",
"label": "Estimated number of cells",
"description": "CellRanger's estimate of the number of cells per sample based on the UMI count distribution.",
"groupBy": "sample_id",
"nBins": 10,
"yAxisType": "linear"
},
{
"type": "bar",
"field": "Sequencing_saturation",
"label": "Sequencing saturation",
"description": "Fraction of reads that are duplicates of existing UMIs. Higher values suggest deeper sequencing coverage.",
"groupBy": "sample_id",
"nBins": 10,
"yAxisType": "linear"
}
]
},
{
"name": "Cell RNA QC",
"key": "cell_rna_stats",
"additionalAxes": true,
"defaultFilters": [
{
"type": "histogram",
"field": "total_counts",
"label": "Total UMI per cell",
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "num_nonzero_vars",
"label": "Number of non-zero genes per cell",
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "fraction_mitochondrial",
"label": "Fraction UMI of mitochondrial genes per cell",
"description": "Proportion of cell's RNA from mitochondrial genes.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "fraction_ribosomal",
"label": "Fraction UMI of ribosomal genes per cell",
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "pct_of_counts_in_top_50_vars",
"label": "Fraction UMI in top 50 genes per cell",
"description": "Proportion of RNA molecules from the 50 most-expressed genes in each cell.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_cell_probability",
"label": "CellBender cell probability",
"description": "CellBender's statistical confidence (0-1) that a barcode represents a real cell, with higher values indicating stronger confidence.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_background_fraction",
"label": "CellBender background fraction",
"description": "Estimated percentage of each cell's RNA that comes from the ambient solution rather than the cell itself.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_cell_size",
"label": "CellBender cell size",
"description": "CellBender's estimate of the true number of RNA molecules in each cell after removing ambient contamination. Reflects actual cell RNA content rather than raw UMI counts.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_droplet_efficiency",
"label": "CellBender droplet efficiency",
"description": "CellBender's estimate of how efficiently each droplet captured RNA molecules. Higher values indicate more reliable RNA sampling within individual droplets.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
}
]
}
]
}

View File

@@ -0,0 +1,91 @@
{
"categories": [
{
"name": "Sample QC",
"key": "sample_summary_stats",
"additionalAxes": false,
"defaultFilters": []
},
{
"name": "Cell RNA QC",
"key": "cell_rna_stats",
"additionalAxes": true,
"defaultFilters": [
{
"type": "histogram",
"visualizationType": "histogram",
"field": "total_counts",
"label": "Total UMI per cell",
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "num_nonzero_vars",
"label": "Number of non-zero genes per cell",
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "fraction_mitochondrial",
"label": "Fraction UMI of mitochondrial genes per cell",
"description": "Proportion of cell's RNA from mitochondrial genes.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "fraction_ribosomal",
"label": "Fraction UMI of ribosomal genes per cell",
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "cell_area",
"label": "Segmented cell area",
"description": "Area of the segmented cells.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "nucleus_ratio",
"label": "Nucleus Ratio",
"description": "Ratio of the nucleus area to the segmented cell area.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
}
]
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    Sets the root logger's level to INFO and attaches a stream handler on
    ``sys.stdout`` that formats records as
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: when the root logger already has a stream
    handler bound to the current ``sys.stdout``, no additional handler is
    attached, so calling it repeatedly does not duplicate log lines.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only attach a handler if none is writing to this stdout yet; otherwise
    # every record would be emitted once per setup_logger() call.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger

View File

@@ -0,0 +1,484 @@
name: "h5mu_to_qc_json"
namespace: "ingestion_qc"
version: "v0.1.0"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
- name: "Dorien Roosen"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Robrecht Cannoodt"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
description: "The input h5mu file(s)"
info: null
example:
- "path/to/file.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "The modality to use"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--ingestion_method"
description: "Method that was used to ingest the data - this will define the structure\
\ of the report that is generated."
info: null
required: true
choices:
- "cellranger_multi"
- "xenium"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_sample_id"
description: "The key in the h5mu file that contains the sample ID. If not provided,\
\ each H5MU file will be considered as a separate sample."
info: null
default:
- "sample_id"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_total_counts"
description: "The key in the h5mu .obs field that contains the total counts."
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_num_nonzero_vars"
description: "The key in the h5mu .obs field that contains the number of nonzero\
\ vars."
info: null
default:
- "num_nonzero_vars"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_fraction_mitochondrial"
description: "The key in the h5mu .obs field that contains the fraction mitochondrial\
\ genes."
info: null
default:
- "fraction_mitochondrial"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_fraction_ribosomal"
description: "The key in the h5mu .obs field that contains the fraction ribosomal\
\ genes."
info: null
default:
- "fraction_ribosomal"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
description: "The output JSON file"
info: null
example:
- "path/to/file.json"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output_reporting_json"
description: "The output JSON file that defines the QC report"
info: null
example:
- "path/to/file.json"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- name: "Filtering & grouping options"
arguments:
- type: "integer"
name: "--min_total_counts"
description: "Minimum total counts for a cell to be included in the output"
info: null
default:
- 10
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--min_num_nonzero_vars"
description: "Minimum number of nonzero vars for a cell to be included in the\
\ output"
info: null
default:
- 10
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_metadata"
description: "The metadata keys in the h5mu .obs to include in the output JSON."
info: null
example:
- "donor_id;cell_type;batch;condition"
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- name: "Options for CellRanger reports"
arguments:
- type: "string"
name: "--obs_cellbender"
description: "The cellbender keys in the h5mu .obs to include in the output JSON"
info: null
default:
- "cellbender_background_fraction"
- "cellbender_cell_probability"
- "cellbender_cell_size"
- "cellbender_droplet_efficiency"
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--uns_cellranger_metrics"
description: "The key in the h5mu file .uns that contains the cellranger metrics"
info: null
default:
- "metrics_cellranger"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Options for Xenium reports"
arguments:
- type: "string"
name: "--obs_nucleus_area"
description: "The key in the h5mu .obs field that contains the nucleus area."
info: null
default:
- "nucleus_area"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_cell_area"
description: "The key in the h5mu .obs field that contains the cell area."
info: null
default:
- "cell_area"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_x_coord"
description: "The key in the h5mu .obs field that contains the x coordinate."
info: null
default:
- "x_coord"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_y_coord"
description: "The key in the h5mu .obs field that contains the y coordinate."
info: null
default:
- "y_coord"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_control_probe_counts"
description: "The key in the h5mu .obs field that contains the number of control\
\ probes."
info: null
default:
- "control_probe_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_control_codeword_counts"
description: "The key in the h5mu .obs field that contains the number of control\
\ codewords."
info: null
default:
- "control_codeword_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "report_structure"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Takes H5MU files that have been ingested by CellRanger, Xenium or CosMx\
\ and processed by the QC workflow, and generates:\n- A JSON file that contains\
\ the combined data for the QC report\n- A JSON file that defines the layout and\
\ structure of the QC report\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "resources_test"
info: null
status: "enabled"
scope:
image: "private"
target: "private"
requirements:
commands:
- "ps"
repositories:
- type: "github"
name: "openpipeline"
repo: "openpipelines-bio/openpipeline"
tag: "2.1.2"
- type: "vsh"
name: "craftbox"
repo: "craftbox"
tag: "v0.2.0"
links:
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midmem"
- "middisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "v0.1.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml"
runner: "nextflow"
engine: "docker|native"
output: "target/_private/nextflow/ingestion_qc/h5mu_to_qc_json"
executable: "target/_private/nextflow/ingestion_qc/h5mu_to_qc_json/main.nf"
viash_version: "0.9.4"
git_commit: "4de00a2614069bdaee27943e73a51d378e465c60"
git_remote: "https://github.com/openpipelines-bio/openpipeline_qc"
git_tag: "v0.1.0"
package_config:
name: "openpipeline_qc"
version: "v0.1.0"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-bio/openpipeline_incubator/resources_test"
dest: "resources_test"
repositories:
- type: "github"
name: "openpipeline"
repo: "openpipelines-bio/openpipeline"
tag: "2.1.2"
- type: "vsh"
name: "craftbox"
repo: "craftbox"
tag: "v0.2.0"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].directives.tag\
\ := '$id'\n.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'v0.1.0'"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
docker_registry: "ghcr.io"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the ingestion_qc/h5mu_to_qc_json component:
// its name, entry script, required Nextflow version, release version,
// human-readable description and author list.
manifest {
name = 'ingestion_qc/h5mu_to_qc_json'
mainScript = 'main.nf'
// Nextflow version constraint for running this pipeline.
nextflowVersion = '!>=20.12.1-edge'
version = 'v0.1.0'
description = 'Takes H5MU files that have been ingested by CellRanger, Xenium or CosMx and processed by the QC workflow, and generates:\n- A JSON file that contains the combined data for the QC report\n- A JSON file that defines the layout and structure of the QC report\n'
author = 'Jakub Majercik, Dorien Roosen, Robrecht Cannoodt, Weiwei Schultz'
}
// Default container image applied to processes.
process.container = 'nextflow/bash:latest'
// detect tempdir
// The environment variables NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR are
// consulted in that order; the first one holding a non-empty value is used,
// and '/tmp' is the fallback. The chosen location is stored as an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Selectable configuration profiles.
profiles {
// no_publish: turn off publishDir for every process (name pattern '.*').
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// mount_temp: point the docker, podman and charliecloud temp setting at the
// tempDir value computed earlier in this file.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Container-engine profiles: each one enables exactly one engine and
// explicitly disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// singularity additionally switches on autoMounts.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed-size resource labels. A process carrying one of these labels gets the
// corresponding memory amount (expressed in bytes) or CPU count.
process{
// Decimal memory sizes: memN{gb,tb} = N * 10^9 / 10^12 bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory sizes: memN{gib,tib} = N * 2^30 / 2^40 bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU counts: cpuN = N CPUs.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Load the additional configuration stored in nextflow_labels.config.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
// Default process settings plus the named CPU, memory and disk labels.
process {
// Default resources for components that hardly do any processing
// (memory grows with each attempt: 2 GB times task.attempt).
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140); every other exit status selects 'terminate'.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional memory ceiling read by get_memory(). While it is left unset
// (null), get_memory() hands back the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each request is scaled by task.attempt and then passed through get_memory().
withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// Every disk label resolves to process.disk when that is set, and to null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
/**
 * Limit a requested memory amount to the ceiling configured in
 * `process.maxMemory`.
 *
 * Behaviour:
 *   - When `process.maxMemory` is absent or unset, the request is returned
 *     unchanged.
 *   - When `process.maxRetries` is set and `task.attempt` equals it,
 *     `process.maxMemory` itself is returned.
 *   - When the request is larger than `process.maxMemory`, the ceiling is
 *     returned as a `nextflow.util.MemoryUnit`.
 *   - Otherwise the request is returned unchanged.
 *
 * Any error raised while evaluating the settings prints a diagnostic and
 * stops the JVM with exit status 1.
 *
 * @param to_compare the requested memory amount (compared against
 *                   `process.maxMemory` through `compareTo`)
 * @return the memory amount to use for the task
 */
def get_memory(to_compare) {
  // No ceiling configured: nothing to cap.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only guarantees the sign of its result, so test for "> 0"
    // rather than for the exact value 1.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Return the configured ceiling. The previous code returned the
      // undefined name `max_memory`, which threw and ended up in the
      // catch block below, aborting the run.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,162 @@
{
"categories": [
{
"name": "Sample QC",
"key": "sample_summary_stats",
"additionalAxes": false,
"defaultFilters": []
},
{
"name": "SampleQC",
"key": "metrics_cellranger_stats",
"additionalAxes": false,
"defaultFilters": [
{
"type": "bar",
"field": "Number_of_reads_in_the_library",
"label": "Number of reads per library",
"description": "Sequencing depth per sample. Higher values generally indicate more comprehensive cell profiling.",
"nBins": 10,
"groupBy": "sample_id",
"xAxisType": "linear",
"yAxisType": "linear"
},
{
"type": "bar",
"field": "Confidently_mapped_reads_in_cells",
"label": "Confidently mapped reads in cells",
"description": "Number of reads that were mapped unambiguously to the reference genome within cell-containing droplets.",
"groupBy": "sample_id",
"nBins": 10,
"yAxisType": "linear"
},
{
"type": "bar",
"field": "Estimated_number_of_cells",
"label": "Estimated number of cells",
"description": "CellRanger's estimate of the number of cells per sample based on the UMI count distribution.",
"groupBy": "sample_id",
"nBins": 10,
"yAxisType": "linear"
},
{
"type": "bar",
"field": "Sequencing_saturation",
"label": "Sequencing saturation",
"description": "Fraction of reads that are duplicates of existing UMIs. Higher values suggest deeper sequencing coverage.",
"groupBy": "sample_id",
"nBins": 10,
"yAxisType": "linear"
}
]
},
{
"name": "Cell RNA QC",
"key": "cell_rna_stats",
"additionalAxes": true,
"defaultFilters": [
{
"type": "histogram",
"field": "total_counts",
"label": "Total UMI per cell",
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "num_nonzero_vars",
"label": "Number of non-zero genes per cell",
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "fraction_mitochondrial",
"label": "Fraction UMI of mitochondrial genes per cell",
"description": "Proportion of a cell's RNA from mitochondrial genes.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "fraction_ribosomal",
"label": "Fraction UMI of ribosomal genes per cell",
"description": "Proportion of a cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "pct_of_counts_in_top_50_vars",
"label": "Fraction UMI in top 50 genes per cell",
"description": "Proportion of RNA molecules from the 50 most-expressed genes in each cell.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_cell_probability",
"label": "CellBender cell probability",
"description": "CellBender's statistical confidence (0-1) that a barcode represents a real cell, with higher values indicating stronger confidence.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_background_fraction",
"label": "CellBender background fraction",
"description": "Estimated percentage of each cell's RNA that comes from the ambient solution rather than the cell itself.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_cell_size",
"label": "CellBender cell size",
"description": "CellBender's estimate of the true number of RNA molecules in each cell after removing ambient contamination. Reflects actual cell RNA content rather than raw UMI counts.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"field": "cellbender_droplet_efficiency",
"label": "CellBender droplet efficiency",
"description": "CellBender's estimate of how efficiently each droplet captured RNA molecules. Higher values indicate more reliable RNA sampling within individual droplets.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
}
]
}
]
}

View File

@@ -0,0 +1,91 @@
{
"categories": [
{
"name": "Sample QC",
"key": "sample_summary_stats",
"additionalAxes": false,
"defaultFilters": []
},
{
"name": "Cell RNA QC",
"key": "cell_rna_stats",
"additionalAxes": true,
"defaultFilters": [
{
"type": "histogram",
"visualizationType": "histogram",
"field": "total_counts",
"label": "Total UMI per cell",
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "num_nonzero_vars",
"label": "Number of non-zero genes per cell",
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
"cutoffMin": null,
"cutoffMax": null,
"zoomMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "fraction_mitochondrial",
"label": "Fraction UMI of mitochondrial genes per cell",
"description": "Proportion of a cell's RNA from mitochondrial genes.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "fraction_ribosomal",
"label": "Fraction UMI of ribosomal genes per cell",
"description": "Proportion of a cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "cell_area",
"label": "Segmented cell area",
"description": "Area of the segmented cells.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
},
{
"type": "histogram",
"visualizationType": "histogram",
"field": "nucleus_ratio",
"label": "Nucleus Ratio",
"description": "Ratio of the nucleus area to the segmented cell area.",
"cutoffMin": null,
"cutoffMax": null,
"nBins": 50,
"groupBy": "sample_id",
"yAxisType": "linear"
}
]
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to emit INFO-level records to stdout.

    The root logger's level is set to INFO and a stream handler writing to
    ``sys.stdout`` is attached, using the format
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: when the root logger already carries a
    stream handler bound to the current ``sys.stdout``, no additional
    handler is attached, so calling this function more than once does not
    make every log record appear multiple times.

    Returns:
        logging.Logger: the configured root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Adding a second handler for the same stream would duplicate every
    # message, so only attach one when none is present yet.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,647 @@
name: "cellbender_remove_background"
namespace: "correction"
version: "2.1.2"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file. Data file on which to run tool. Data must be un-filtered:\
\ it should include empty droplets."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "List of modalities to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Full count matrix as an h5mu file, with background RNA removed.\
\ This file contains all the original droplet barcodes."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer_output"
description: "Output layer"
info: null
default:
- "cellbender_corrected"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_background_fraction"
info: null
default:
- "cellbender_background_fraction"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_cell_probability"
info: null
default:
- "cellbender_cell_probability"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_cell_size"
info: null
default:
- "cellbender_cell_size"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_droplet_efficiency"
info: null
default:
- "cellbender_droplet_efficiency"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_latent_scale"
info: null
default:
- "cellbender_latent_scale"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_ambient_expression"
info: null
default:
- "cellbender_ambient_expression"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_gene_expression_encoding"
info: null
default:
- "cellbender_gene_expression_encoding"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Arguments"
arguments:
- type: "boolean"
name: "--expected_cells_from_qc"
description: "Will use the Cell Ranger QC to determine the estimated number of\
\ cells"
info: null
default:
- false
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--expected_cells"
description: "Number of cells expected in the dataset (a rough estimate within\
\ a factor of 2 is sufficient)."
info: null
example:
- 1000
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--total_droplets_included"
description: "The number of droplets from the rank-ordered UMI plot\nthat will\
\ have their cell probabilities inferred as an\noutput. Include the droplets\
\ which might contain cells.\nDroplets beyond TOTAL_DROPLETS_INCLUDED should\
\ be\n'surely empty' droplets.\n"
info: null
example:
- 25000
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--force_cell_umi_prior"
description: "Ignore CellBender's heuristic prior estimation, and use this prior\
\ for UMI counts in cells."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--force_empty_umi_prior"
description: "Ignore CellBender's heuristic prior estimation, and use this prior\
\ for UMI counts in empty droplets."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--model"
description: "Which model is being used for count data.\n\n* 'naive' subtracts\
\ the estimated ambient profile.\n* 'simple' does not model either ambient RNA\
\ or random barcode swapping (for debugging purposes -- not recommended).\n\
* 'ambient' assumes background RNA is incorporated into droplets.\n* 'swapping'\
\ assumes background RNA comes from random barcode swapping (via PCR chimeras).\n\
* 'full' uses a combined ambient and swapping model.\n"
info: null
default:
- "full"
required: false
choices:
- "naive"
- "simple"
- "ambient"
- "swapping"
- "full"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--epochs"
description: "Number of epochs to train."
info: null
default:
- 150
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--low_count_threshold"
description: "Droplets with UMI counts below this number are completely \nexcluded\
\ from the analysis. This can help identify the correct \nprior for empty droplet\
\ counts in the rare case where empty \ncounts are extremely high (over 200).\n"
info: null
default:
- 5
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--z_dim"
description: "Dimension of latent variable z.\n"
info: null
default:
- 64
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--z_layers"
description: "Dimension of hidden layers in the encoder for z.\n"
info: null
default:
- 512
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "double"
name: "--training_fraction"
description: "Training detail: the fraction of the data used for training.\nThe\
\ rest is never seen by the inference algorithm. Speeds up learning.\n"
info: null
default:
- 0.9
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--empty_drop_training_fraction"
description: "Training detail: the fraction of the training data each epoch that\
\ \nis drawn (randomly sampled) from surely empty droplets.\n"
info: null
default:
- 0.2
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--ignore_features"
description: "Integer indices of features to ignore entirely. In the output\n\
count matrix, the counts for these features will be unchanged.\n"
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "double"
name: "--fpr"
description: "Target 'delta' false positive rate in [0, 1). Use 0 for a cohort\n\
of samples which will be jointly analyzed for differential expression.\nA false\
\ positive is a true signal count that is erroneously removed.\nMore background\
\ removal is accompanied by more signal removal at\nhigh values of FPR. You\
\ can specify multiple values, which will\ncreate multiple output files.\n"
info: null
default:
- 0.01
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--exclude_feature_types"
description: "Feature types to ignore during the analysis. These features will\n\
be left unchanged in the output file.\n"
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "double"
name: "--projected_ambient_count_threshold"
description: "Controls how many features are included in the analysis, which\n\
can lead to a large speedup. If a feature is expected to have less\nthan PROJECTED_AMBIENT_COUNT_THRESHOLD\
\ counts total in all cells\n(summed), then that gene is excluded, and it will\
\ be unchanged\nin the output count matrix. For example, \nPROJECTED_AMBIENT_COUNT_THRESHOLD\
\ = 0 will include all features\nwhich have even a single count in any empty\
\ droplet.\n"
info: null
default:
- 0.1
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--learning_rate"
description: "Training detail: lower learning rate for inference.\nA OneCycle\
\ learning rate schedule is used, where the\nupper learning rate is ten times\
\ this value. (For this\nvalue, probably do not exceed 1e-3).\n"
info: null
default:
- 1.0E-4
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--final_elbo_fail_fraction"
description: "Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO\
\ - initial_test_ELBO) > FINAL_ELBO_FAIL_FRACTION.\nTraining will automatically\
\ re-run if --num-training-tries > 1.\nBy default, will not fail training based\
\ on final_training_ELBO.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--epoch_elbo_fail_fraction"
description: "Training is considered to have failed if \n(previous_epoch_test_ELBO\
\ - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO)\
\ > EPOCH_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries\
\ > 1.\nBy default, will not fail training based on epoch_training_ELBO.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--num_training_tries"
description: "Number of times to attempt to train the model. At each subsequent\
\ attempt,\nthe learning rate is multiplied by LEARNING_RATE_RETRY_MULT.\n"
info: null
default:
- 1
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--learning_rate_retry_mult"
description: "Learning rate is multiplied by this amount each time a new training\n\
attempt is made. (This parameter is only used if training fails based\non EPOCH_ELBO_FAIL_FRACTION\
\ or FINAL_ELBO_FAIL_FRACTION and\nNUM_TRAINING_TRIES is > 1.) \n"
info: null
default:
- 0.2
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--posterior_batch_size"
description: "Training detail: size of batches when creating the posterior.\n\
Reduce this to avoid running out of GPU memory creating the posterior\n(will\
\ be slower).\n"
info: null
default:
- 128
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--posterior_regulation"
description: "Posterior regularization method. (For experts: not required for\
\ normal usage,\nsee documentation). \n\n* PRq is approximate quantile-targeting.\n\
* PRmu is approximate mean-targeting aggregated over genes (behavior of v0.2.0).\n\
* PRmu_gene is approximate mean-targeting per gene.\n"
info: null
required: false
choices:
- "PRq"
- "PRmu"
- "PRmu_gene"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--alpha"
description: "Tunable parameter alpha for the PRq posterior regularization method\n\
(not normally used: see documentation).\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--q"
description: "Tunable parameter q for the CDF threshold estimation method (not\n\
normally used: see documentation).\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--estimator"
description: "Output denoised count estimation method. (For experts: not required\n\
for normal usage, see documentation).\n"
info: null
default:
- "mckp"
required: false
choices:
- "map"
- "mean"
- "cdf"
- "sample"
- "mckp"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--estimator_multiple_cpu"
description: "Including the flag --estimator-multiple-cpu will use more than one\n\
CPU to compute the MCKP output count estimator in parallel (does nothing\nfor\
\ other estimators).\n"
info: null
direction: "input"
- type: "boolean"
name: "--constant_learning_rate"
description: "Including the flag --constant-learning-rate will use the ClippedAdam\n\
optimizer instead of the OneCycleLR learning rate schedule, which is\nthe default.\
\ Learning is faster with the OneCycleLR schedule.\nHowever, training can easily\
\ be continued from a checkpoint for more\nepochs than the initial command specified\
\ when using ClippedAdam. On\nthe other hand, if using the OneCycleLR schedule\
\ with 150 epochs\nspecified, it is not possible to pick up from that final\
\ checkpoint\nand continue training until 250 epochs.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--debug"
description: "Including the flag --debug will log extra messages useful for debugging.\n"
info: null
direction: "input"
- type: "boolean_true"
name: "--cuda"
description: "Including the flag --cuda will run the inference on a\nGPU.\n"
info: null
direction: "input"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Eliminating technical artifacts from high-throughput single-cell RNA\
\ sequencing data.\n\nThis module removes counts due to ambient RNA molecules and\
\ random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the\
\ moment, only the count matrices produced by the CellRanger count pipeline are supported.\
\ Support for additional tools and protocols \nwill be added in the future. A quick\
\ start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midcpu"
- "midmem"
- "gpu"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04"
target_tag: "2.1.0"
namespace_separator: "/"
setup:
- type: "docker"
run:
- "apt update && DEBIAN_FRONTEND=noninteractive apt install -y make build-essential\
\ libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates\
\ curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev\
\ liblzma-dev mecab-ipadic-utf8 git \\\n&& curl https://pyenv.run | bash \\\n\
&& pyenv update \\\n&& pyenv install $PYTHON_VERSION \\\n&& pyenv global $PYTHON_VERSION\
\ \\\n&& apt-get clean\n"
env:
- "PYENV_ROOT=\"/root/.pyenv\""
- "PATH=\"$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH\""
- "PYTHON_VERSION=3.7.16"
- type: "python"
user: false
packages:
- "lxml~=4.8.0"
- "mudata~=0.2.1"
- "cellbender~=0.3.0"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/correction/cellbender_remove_background/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/correction/cellbender_remove_background"
executable: "target/nextflow/correction/cellbender_remove_background/main.nf"
viash_version: "0.9.4"
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.1-2-ga0c95224865"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"2.1.2\""
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,125 @@
// Pipeline metadata for the correction/cellbender_remove_background component.
manifest {
name = 'correction/cellbender_remove_background'
mainScript = 'main.nf'
// Minimum Nextflow version. Per the Nextflow documentation, the leading '!'
// makes the requirement strict: the run is aborted instead of only warning
// when the installed version does not satisfy it.
nextflowVersion = '!>=20.12.1-edge'
version = '2.1.2'
description = 'Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n'
}
// Process-wide default for the `container` directive.
// NOTE(review): presumably overridden per process in main.nf -- confirm.
process.container = 'nextflow/bash:latest'
// Detect the temporary directory: take the first environment variable that is
// set to a non-empty value, checking NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR
// in that order, and fall back to '/tmp'. The result is stored as an absolute
// path and is used by the `mount_temp` profile.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Named configuration profiles. Apart from `no_publish` and `mount_temp`,
// each profile enables exactly one container engine and explicitly disables
// the other four, so that at most one engine is active.
profiles {
// Disables publishDir for every process (the '.*' selector matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Sets the `temp` option of docker, podman and charliecloud to the detected
// temporary directory.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Run with Docker only.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Run with Singularity only; autoMounts is switched on as well.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Run with Podman only.
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Run with Shifter only.
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
// Run with Charliecloud only.
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Generic resource labels: each label pins a fixed memory size or CPU count.
process{
// Decimal memory sizes (powers of 1000): 1 GB = 1000000000 bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory sizes (powers of 1024): 1 GiB = 1073741824 bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU counts.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Load the project-specific label definitions from the nextflow_labels.config
// file. It is included after the generic table above.
// NOTE(review): presumably resolved relative to this config file -- confirm.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
// Project-wide process defaults plus the cpu/memory/disk labels used by
// components.
process {
// Default resources for components that hardly do any processing
// (memory grows linearly with the attempt number on retries)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 inclusive); any other failure terminates.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound consulted by get_memory(); while it is null (or
// otherwise falsy) get_memory() returns the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// The request scales with the attempt number and is passed through
// get_memory() before being used.
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// Each label re-uses process.disk when it is set and yields null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Clamp a requested memory amount to the optional `process.maxMemory` cap.
//
// to_compare: the requested memory, e.g. `25.GB * task.attempt`.
//
// Returns:
//   * `to_compare` unchanged when `process.maxMemory` is absent, null or
//     otherwise falsy, or when the request does not exceed the cap;
//   * `process.maxMemory` when the current attempt equals
//     `process.maxRetries`, or when the request exceeds the cap.
//
// If anything in the comparison throws (for example an unparsable maxMemory
// or maxRetries value), an explanatory message is printed and the JVM exits
// with status 1.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; presumably it
    // is resolved from the context of the calling directive closure -- confirm.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      // The attempt number has reached maxRetries: request the full cap.
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request exceeds the cap, so return the cap itself. This branch
      // previously returned the undefined name `max_memory`, which threw and
      // ended up in the catch block below, aborting the run.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,51 @@
# Inputs
input: # please fill in - example: "input.h5mu"
modality: "rna"
# Outputs
# output: "$id.$key.output.h5mu"
# output_compression: "gzip"
layer_output: "cellbender_corrected"
obs_background_fraction: "cellbender_background_fraction"
obs_cell_probability: "cellbender_cell_probability"
obs_cell_size: "cellbender_cell_size"
obs_droplet_efficiency: "cellbender_droplet_efficiency"
obs_latent_scale: "cellbender_latent_scale"
var_ambient_expression: "cellbender_ambient_expression"
obsm_gene_expression_encoding: "cellbender_gene_expression_encoding"
# Arguments
expected_cells_from_qc: false
# expected_cells: 1000
# total_droplets_included: 25000
# force_cell_umi_prior: 123
# force_empty_umi_prior: 123
model: "full"
epochs: 150
low_count_threshold: 5
z_dim: 64
z_layers: [512]
training_fraction: 0.9
empty_drop_training_fraction: 0.2
# ignore_features: [123]
fpr: [0.01]
# exclude_feature_types: ["foo"]
projected_ambient_count_threshold: 0.1
learning_rate: 1.0E-4
# final_elbo_fail_fraction: 123.0
# epoch_elbo_fail_fraction: 123.0
num_training_tries: 1
learning_rate_retry_mult: 0.2
posterior_batch_size: 128
# posterior_regulation: "foo"
# alpha: 123.0
# q: 123.0
estimator: "mckp"
estimator_multiple_cpu: false
# constant_learning_rate: true
debug: false
cuda: false
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"

View File

@@ -0,0 +1,551 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "cellbender_remove_background",
"description": "Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline are supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n",
"type": "object",
"definitions": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"default": "",
"format": "file-path",
"mimetype": "text/csv",
"pattern": "^\\S+\\.csv$"
}
}
},
"inputs" : {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type":
"string",
"description": "Type: `file`, required, example: `input.h5mu`. Input h5mu file",
"help_text": "Type: `file`, required, example: `input.h5mu`. Input h5mu file. Data file on which to run tool. Data must be un-filtered: it should include empty droplets."
}
,
"modality": {
"type":
"string",
"description": "Type: `string`, default: `rna`. List of modalities to process",
"help_text": "Type: `string`, default: `rna`. List of modalities to process."
,
"default":"rna"
}
}
},
"outputs" : {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"output": {
"type":
"string",
"description": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Full count matrix as an h5mu file, with background RNA removed",
"help_text": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Full count matrix as an h5mu file, with background RNA removed. This file contains all the original droplet barcodes."
,
"default":"$id.$key.output.h5mu"
}
,
"output_compression": {
"type":
"string",
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. ",
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. ",
"enum": ["gzip", "lzf"]
}
,
"layer_output": {
"type":
"string",
"description": "Type: `string`, default: `cellbender_corrected`. Output layer",
"help_text": "Type: `string`, default: `cellbender_corrected`. Output layer"
,
"default":"cellbender_corrected"
}
,
"obs_background_fraction": {
"type":
"string",
"description": "Type: `string`, default: `cellbender_background_fraction`. ",
"help_text": "Type: `string`, default: `cellbender_background_fraction`. "
,
"default":"cellbender_background_fraction"
}
,
"obs_cell_probability": {
"type":
"string",
"description": "Type: `string`, default: `cellbender_cell_probability`. ",
"help_text": "Type: `string`, default: `cellbender_cell_probability`. "
,
"default":"cellbender_cell_probability"
}
,
"obs_cell_size": {
"type":
"string",
"description": "Type: `string`, default: `cellbender_cell_size`. ",
"help_text": "Type: `string`, default: `cellbender_cell_size`. "
,
"default":"cellbender_cell_size"
}
,
"obs_droplet_efficiency": {
"type":
"string",
"description": "Type: `string`, default: `cellbender_droplet_efficiency`. ",
"help_text": "Type: `string`, default: `cellbender_droplet_efficiency`. "
,
"default":"cellbender_droplet_efficiency"
}
,
"obs_latent_scale": {
"type":
"string",
"description": "Type: `string`, default: `cellbender_latent_scale`. ",
"help_text": "Type: `string`, default: `cellbender_latent_scale`. "
,
"default":"cellbender_latent_scale"
}
,
"var_ambient_expression": {
"type":
"string",
"description": "Type: `string`, default: `cellbender_ambient_expression`. ",
"help_text": "Type: `string`, default: `cellbender_ambient_expression`. "
,
"default":"cellbender_ambient_expression"
}
,
"obsm_gene_expression_encoding": {
"type":
"string",
"description": "Type: `string`, default: `cellbender_gene_expression_encoding`. ",
"help_text": "Type: `string`, default: `cellbender_gene_expression_encoding`. "
,
"default":"cellbender_gene_expression_encoding"
}
}
},
"arguments" : {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"expected_cells_from_qc": {
"type":
"boolean",
"description": "Type: `boolean`, default: `false`. Will use the Cell Ranger QC to determine the estimated number of cells",
"help_text": "Type: `boolean`, default: `false`. Will use the Cell Ranger QC to determine the estimated number of cells"
,
"default":false
}
,
"expected_cells": {
"type":
"integer",
"description": "Type: `integer`, example: `1000`. Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient)",
"help_text": "Type: `integer`, example: `1000`. Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient)."
}
,
"total_droplets_included": {
"type":
"integer",
"description": "Type: `integer`, example: `25000`. The number of droplets from the rank-ordered UMI plot\nthat will have their cell probabilities inferred as an\noutput",
"help_text": "Type: `integer`, example: `25000`. The number of droplets from the rank-ordered UMI plot\nthat will have their cell probabilities inferred as an\noutput. Include the droplets which might contain cells.\nDroplets beyond TOTAL_DROPLETS_INCLUDED should be\n\u0027surely empty\u0027 droplets.\n"
}
,
"force_cell_umi_prior": {
"type":
"integer",
"description": "Type: `integer`. Ignore CellBender\u0027s heuristic prior estimation, and use this prior for UMI counts in cells",
"help_text": "Type: `integer`. Ignore CellBender\u0027s heuristic prior estimation, and use this prior for UMI counts in cells."
}
,
"force_empty_umi_prior": {
"type":
"integer",
"description": "Type: `integer`. Ignore CellBender\u0027s heuristic prior estimation, and use this prior for UMI counts in empty droplets",
"help_text": "Type: `integer`. Ignore CellBender\u0027s heuristic prior estimation, and use this prior for UMI counts in empty droplets."
}
,
"model": {
"type":
"string",
"description": "Type: `string`, default: `full`, choices: ``naive`, `simple`, `ambient`, `swapping`, `full``. Which model is being used for count data",
"help_text": "Type: `string`, default: `full`, choices: ``naive`, `simple`, `ambient`, `swapping`, `full``. Which model is being used for count data.\n\n* \u0027naive\u0027 subtracts the estimated ambient profile.\n* \u0027simple\u0027 does not model either ambient RNA or random barcode swapping (for debugging purposes -- not recommended).\n* \u0027ambient\u0027 assumes background RNA is incorporated into droplets.\n* \u0027swapping\u0027 assumes background RNA comes from random barcode swapping (via PCR chimeras).\n* \u0027full\u0027 uses a combined ambient and swapping model.\n",
"enum": ["naive", "simple", "ambient", "swapping", "full"]
,
"default":"full"
}
,
"epochs": {
"type":
"integer",
"description": "Type: `integer`, default: `150`. Number of epochs to train",
"help_text": "Type: `integer`, default: `150`. Number of epochs to train."
,
"default":150
}
,
"low_count_threshold": {
"type":
"integer",
"description": "Type: `integer`, default: `5`. Droplets with UMI counts below this number are completely \nexcluded from the analysis",
"help_text": "Type: `integer`, default: `5`. Droplets with UMI counts below this number are completely \nexcluded from the analysis. This can help identify the correct \nprior for empty droplet counts in the rare case where empty \ncounts are extremely high (over 200).\n"
,
"default":5
}
,
"z_dim": {
"type":
"integer",
"description": "Type: `integer`, default: `64`. Dimension of latent variable z",
"help_text": "Type: `integer`, default: `64`. Dimension of latent variable z.\n"
,
"default":64
}
,
"z_layers": {
"type":
"string",
"description": "Type: List of `integer`, default: `512`, multiple_sep: `\";\"`. Dimension of hidden layers in the encoder for z",
"help_text": "Type: List of `integer`, default: `512`, multiple_sep: `\";\"`. Dimension of hidden layers in the encoder for z.\n"
,
"default":"512"
}
,
"training_fraction": {
"type":
"number",
"description": "Type: `double`, default: `0.9`. Training detail: the fraction of the data used for training",
"help_text": "Type: `double`, default: `0.9`. Training detail: the fraction of the data used for training.\nThe rest is never seen by the inference algorithm. Speeds up learning.\n"
,
"default":0.9
}
,
"empty_drop_training_fraction": {
"type":
"number",
"description": "Type: `double`, default: `0.2`. Training detail: the fraction of the training data each epoch that \nis drawn (randomly sampled) from surely empty droplets",
"help_text": "Type: `double`, default: `0.2`. Training detail: the fraction of the training data each epoch that \nis drawn (randomly sampled) from surely empty droplets.\n"
,
"default":0.2
}
,
"ignore_features": {
"type":
"string",
"description": "Type: List of `integer`, multiple_sep: `\";\"`. Integer indices of features to ignore entirely",
"help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Integer indices of features to ignore entirely. In the output\ncount matrix, the counts for these features will be unchanged.\n"
}
,
"fpr": {
"type":
"string",
"description": "Type: List of `double`, default: `0.01`, multiple_sep: `\";\"`. Target \u0027delta\u0027 false positive rate in [0, 1)",
"help_text": "Type: List of `double`, default: `0.01`, multiple_sep: `\";\"`. Target \u0027delta\u0027 false positive rate in [0, 1). Use 0 for a cohort\nof samples which will be jointly analyzed for differential expression.\nA false positive is a true signal count that is erroneously removed.\nMore background removal is accompanied by more signal removal at\nhigh values of FPR. You can specify multiple values, which will\ncreate multiple output files.\n"
,
"default":"0.01"
}
,
"exclude_feature_types": {
"type":
"string",
"description": "Type: List of `string`, multiple_sep: `\";\"`. Feature types to ignore during the analysis",
"help_text": "Type: List of `string`, multiple_sep: `\";\"`. Feature types to ignore during the analysis. These features will\nbe left unchanged in the output file.\n"
}
,
"projected_ambient_count_threshold": {
"type":
"number",
"description": "Type: `double`, default: `0.1`. Controls how many features are included in the analysis, which\ncan lead to a large speedup",
"help_text": "Type: `double`, default: `0.1`. Controls how many features are included in the analysis, which\ncan lead to a large speedup. If a feature is expected to have less\nthan PROJECTED_AMBIENT_COUNT_THRESHOLD counts total in all cells\n(summed), then that gene is excluded, and it will be unchanged\nin the output count matrix. For example, \nPROJECTED_AMBIENT_COUNT_THRESHOLD = 0 will include all features\nwhich have even a single count in any empty droplet.\n"
,
"default":0.1
}
,
"learning_rate": {
"type":
"number",
"description": "Type: `double`, default: `1.0E-4`. Training detail: lower learning rate for inference",
"help_text": "Type: `double`, default: `1.0E-4`. Training detail: lower learning rate for inference.\nA OneCycle learning rate schedule is used, where the\nupper learning rate is ten times this value. (For this\nvalue, probably do not exceed 1e-3).\n"
,
"default":0.0001
}
,
"final_elbo_fail_fraction": {
"type":
"number",
"description": "Type: `double`. Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO - initial_test_ELBO) \u003e FINAL_ELBO_FAIL_FRACTION",
"help_text": "Type: `double`. Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO - initial_test_ELBO) \u003e FINAL_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries \u003e 1.\nBy default, will not fail training based on final_training_ELBO.\n"
}
,
"epoch_elbo_fail_fraction": {
"type":
"number",
"description": "Type: `double`. Training is considered to have failed if \n(previous_epoch_test_ELBO - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO) \u003e EPOCH_ELBO_FAIL_FRACTION",
"help_text": "Type: `double`. Training is considered to have failed if \n(previous_epoch_test_ELBO - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO) \u003e EPOCH_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries \u003e 1.\nBy default, will not fail training based on epoch_training_ELBO.\n"
}
,
"num_training_tries": {
"type":
"integer",
"description": "Type: `integer`, default: `1`. Number of times to attempt to train the model",
"help_text": "Type: `integer`, default: `1`. Number of times to attempt to train the model. At each subsequent attempt,\nthe learning rate is multiplied by LEARNING_RATE_RETRY_MULT.\n"
,
"default":1
}
,
"learning_rate_retry_mult": {
"type":
"number",
"description": "Type: `double`, default: `0.2`. Learning rate is multiplied by this amount each time a new training\nattempt is made",
"help_text": "Type: `double`, default: `0.2`. Learning rate is multiplied by this amount each time a new training\nattempt is made. (This parameter is only used if training fails based\non EPOCH_ELBO_FAIL_FRACTION or FINAL_ELBO_FAIL_FRACTION and\nNUM_TRAINING_TRIES is \u003e 1.) \n"
,
"default":0.2
}
,
"posterior_batch_size": {
"type":
"integer",
"description": "Type: `integer`, default: `128`. Training detail: size of batches when creating the posterior",
"help_text": "Type: `integer`, default: `128`. Training detail: size of batches when creating the posterior.\nReduce this to avoid running out of GPU memory creating the posterior\n(will be slower).\n"
,
"default":128
}
,
"posterior_regulation": {
"type":
"string",
"description": "Type: `string`, choices: ``PRq`, `PRmu`, `PRmu_gene``. Posterior regularization method",
"help_text": "Type: `string`, choices: ``PRq`, `PRmu`, `PRmu_gene``. Posterior regularization method. (For experts: not required for normal usage,\nsee documentation). \n\n* PRq is approximate quantile-targeting.\n* PRmu is approximate mean-targeting aggregated over genes (behavior of v0.2.0).\n* PRmu_gene is approximate mean-targeting per gene.\n",
"enum": ["PRq", "PRmu", "PRmu_gene"]
}
,
"alpha": {
"type":
"number",
"description": "Type: `double`. Tunable parameter alpha for the PRq posterior regularization method\n(not normally used: see documentation)",
"help_text": "Type: `double`. Tunable parameter alpha for the PRq posterior regularization method\n(not normally used: see documentation).\n"
}
,
"q": {
"type":
"number",
"description": "Type: `double`. Tunable parameter q for the CDF threshold estimation method (not\nnormally used: see documentation)",
"help_text": "Type: `double`. Tunable parameter q for the CDF threshold estimation method (not\nnormally used: see documentation).\n"
}
,
"estimator": {
"type":
"string",
"description": "Type: `string`, default: `mckp`, choices: ``map`, `mean`, `cdf`, `sample`, `mckp``. Output denoised count estimation method",
"help_text": "Type: `string`, default: `mckp`, choices: ``map`, `mean`, `cdf`, `sample`, `mckp``. Output denoised count estimation method. (For experts: not required\nfor normal usage, see documentation).\n",
"enum": ["map", "mean", "cdf", "sample", "mckp"]
,
"default":"mckp"
}
,
"estimator_multiple_cpu": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Including the flag --estimator-multiple-cpu will use more than one\nCPU to compute the MCKP output count estimator in parallel (does nothing\nfor other estimators)",
"help_text": "Type: `boolean_true`, default: `false`. Including the flag --estimator-multiple-cpu will use more than one\nCPU to compute the MCKP output count estimator in parallel (does nothing\nfor other estimators).\n"
,
"default":false
}
,
"constant_learning_rate": {
"type":
"boolean",
"description": "Type: `boolean`. Including the flag --constant-learning-rate will use the ClippedAdam\noptimizer instead of the OneCycleLR learning rate schedule, which is\nthe default",
"help_text": "Type: `boolean`. Including the flag --constant-learning-rate will use the ClippedAdam\noptimizer instead of the OneCycleLR learning rate schedule, which is\nthe default. Learning is faster with the OneCycleLR schedule.\nHowever, training can easily be continued from a checkpoint for more\nepochs than the initial command specified when using ClippedAdam. On\nthe other hand, if using the OneCycleLR schedule with 150 epochs\nspecified, it is not possible to pick up from that final checkpoint\nand continue training until 250 epochs.\n"
}
,
"debug": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Including the flag --debug will log extra messages useful for debugging",
"help_text": "Type: `boolean_true`, default: `false`. Including the flag --debug will log extra messages useful for debugging.\n"
,
"default":false
}
,
"cuda": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Including the flag --cuda will run the inference on a\nGPU",
"help_text": "Type: `boolean_true`, default: `false`. Including the flag --cuda will run the inference on a\nGPU.\n"
,
"default":false
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/inputs"
},
{
"$ref": "#/definitions/outputs"
},
{
"$ref": "#/definitions/arguments"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Attach an INFO-level stdout handler to the root logger and return it.

    Every call adds one new ``StreamHandler`` writing to ``sys.stdout`` with the
    format ``<asctime> <levelname, padded to 8> <message>``; handlers already
    present on the root logger are left in place.

    Returns:
        logging.Logger: the root logger, with its level set to ``INFO``.
    """
    import logging
    import sys

    root = logging.getLogger()
    root.setLevel(logging.INFO)

    # Build the handler and its formatter in one go, then register it.
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )
    root.addHandler(handler)
    return root

View File

@@ -0,0 +1,262 @@
name: "add_id"
namespace: "metadata"
version: "2.1.2"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Path to the input .h5mu."
info: null
example:
- "sample_path"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_id"
description: "The input id."
info: null
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_output"
description: "Name of the .obs column where to store the id."
info: null
default:
- "sample_id"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "The compression format to be used on the output h5mu object."
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--make_observation_keys_unique"
description: "Join the id to the .obs index (.obs_names)."
info: null
direction: "input"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Add id of .obs. Also allows to make .obs_names (the .obs index) unique\
\ \nby prefixing the values with an unique id per .h5mu file.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "lowmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11-slim"
target_tag: "2.1.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/metadata/add_id/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/metadata/add_id"
executable: "target/nextflow/metadata/add_id/main.nf"
viash_version: "0.9.4"
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.1-2-ga0c95224865"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"2.1.2\""
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,126 @@
// Nextflow configuration for the 'metadata/add_id' component.
// Sections: pipeline manifest, default container, temp-dir detection,
// container-engine profiles, fixed resource labels, and an extra label config include.
manifest {
name = 'metadata/add_id'
mainScript = 'main.nf'
// NOTE(review): in Nextflow's manifest syntax a leading '!' is documented as making
// the version requirement mandatory (abort on mismatch) — confirm against the docs.
nextflowVersion = '!>=20.12.1-edge'
version = '2.1.2'
description = 'Add id of .obs. Also allows to make .obs_names (the .obs index) unique \nby prefixing the values with an unique id per .h5mu file.\n'
author = 'Dries Schaumont'
}
// Default container image for all processes.
process.container = 'nextflow/bash:latest'
// detect tempdir
// Uses the first of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR that is set to a
// non-empty value (Elvis operator), falling back to '/tmp'; the result is made absolute.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
profiles {
// Disables publishDir for every process (the selector '.*' matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at the detected tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each engine profile below enables exactly one container engine and
// explicitly disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
singularity {
singularity.enabled = true
// Only the singularity profile additionally turns on autoMounts.
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed resource presets selectable via process labels.
process{
// Decimal memory sizes: mem<N>gb / mem<N>tb are powers of 1000 bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory sizes: mem<N>gib / mem<N>tib are powers of 1024 bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU count presets.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Load the additional label configuration stored in nextflow_labels.config.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
// Process-scope resource settings: global defaults followed by label-based
// overrides for CPU, memory and disk.
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 select 'retry'; every other status selects 'terminate')
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional memory cap read by get_memory(); while it is null, get_memory()
// returns the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each request grows linearly with task.attempt and is passed through get_memory().
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// Each label re-uses process.disk when it is set and yields null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
/**
 * Clamp a requested memory amount to the optional `process.maxMemory` cap.
 *
 * Outcomes, in order of evaluation:
 *  - no cap configured (key absent or falsy value): the request is returned as-is;
 *  - `process.maxRetries` is set and the current `task.attempt` equals it:
 *    the cap itself is returned;
 *  - the request is larger than the cap: the cap is returned;
 *  - otherwise: the request is returned.
 *
 * Reads `process` and `task` from the enclosing scope, so it is meant to be
 * called from inside a directive closure.
 *
 * @param to_compare requested memory; must support compareTo() against a
 *                   nextflow.util.MemoryUnit (e.g. `4.GB * task.attempt`)
 * @return the requested amount or the configured cap
 */
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      // Attempt number equals maxRetries: request the full cap.
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request exceeds the cap. This branch used to return the undefined
      // name `max_memory`, which always fell into the catch block below and
      // aborted the run with a misleading configuration error.
      // compareTo() only guarantees the sign of its result, hence `> 0`.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    // Any failure here (e.g. an unparsable maxMemory/maxRetries) is fatal.
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,11 @@
# Arguments
input: # please fill in - example: "sample_path"
input_id: # please fill in - example: "foo"
obs_output: "sample_id"
# output: "$id.$key.output.h5mu"
# output_compression: "gzip"
make_observation_keys_unique: false
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"

View File

@@ -0,0 +1,132 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "add_id",
"description": "Add id of .obs. Also allows to make .obs_names (the .obs index) unique \nby prefixing the values with an unique id per .h5mu file.\n",
"type": "object",
"definitions": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"default": "",
"format": "file-path",
"mimetype": "text/csv",
"pattern": "^\\S+\\.csv$"
}
}
},
"arguments" : {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type":
"string",
"description": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu",
"help_text": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu."
}
,
"input_id": {
"type":
"string",
"description": "Type: `string`, required. The input id",
"help_text": "Type: `string`, required. The input id."
}
,
"obs_output": {
"type":
"string",
"description": "Type: `string`, default: `sample_id`. Name of the .obs column where to store the id",
"help_text": "Type: `string`, default: `sample_id`. Name of the .obs column where to store the id."
,
"default":"sample_id"
}
,
"output": {
"type":
"string",
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. ",
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. "
,
"default":"$id.$key.output.h5mu"
}
,
"output_compression": {
"type":
"string",
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object",
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object.",
"enum": ["gzip", "lzf"]
}
,
"make_observation_keys_unique": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Join the id to the .obs index (.obs_names)",
"help_text": "Type: `boolean_true`, default: `false`. Join the id to the .obs index (.obs_names)."
,
"default":false
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/arguments"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Attach an INFO-level stdout handler to the root logger and return it.

    Every call adds one new ``StreamHandler`` writing to ``sys.stdout`` with the
    format ``<asctime> <levelname, padded to 8> <message>``; handlers already
    present on the root logger are left in place.

    Returns:
        logging.Logger: the root logger, with its level set to ``INFO``.
    """
    import logging
    import sys

    root = logging.getLogger()
    root.setLevel(logging.INFO)

    # Build the handler and its formatter in one go, then register it.
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )
    root.addHandler(handler)
    return root

View File

@@ -0,0 +1,318 @@
name: "grep_annotation_column"
namespace: "metadata"
version: "2.1.2"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
description: "Arguments related to the input dataset."
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Path to the input .h5mu."
info: null
example:
- "sample_path"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_column"
description: "Column to query. If not specified, use .var_names or .obs_names,\
\ depending on the value of --matrix"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "Input data to use when calculating fraction of observations that\
\ match with the query. \nOnly used when --output_fraction_column is provided.\
\ If not specified, .X is used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to get the annotation matrix from.\n"
info: null
example:
- "rna"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--matrix"
description: "Matrix to fetch the column from that will be searched."
info: null
example:
- "var"
required: false
choices:
- "var"
- "obs"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Arguments related to how the output will be written."
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "The compression format to be used on the output h5mu object."
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_match_column"
description: "Name of the column to write the result to."
info: null
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_fraction_column"
description: "For the opposite axis, name of the column to write the fraction\
\ of \nobservations that matches to the pattern.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Query options"
description: "Options related to the query"
arguments:
- type: "string"
name: "--regex_pattern"
description: "Regex to use to match with the input column."
info: null
example:
- "^[mM][tT]-"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Perform a regex lookup on a column from the annotation matrices .obs\
\ or .var.\nThe annotation matrix can originate from either a modality, or all modalities\
\ (global .var or .obs).\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "lowmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11-slim"
target_tag: "2.1.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/metadata/grep_annotation_column/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/metadata/grep_annotation_column"
executable: "target/nextflow/metadata/grep_annotation_column/main.nf"
viash_version: "0.9.4"
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.1-2-ga0c95224865"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"2.1.2\""
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a copy of an .h5mu file in which datasets are compressed.

    The HDF5 tree of ``input_path`` is walked and re-created in ``output_path``:
    groups are re-created with their attributes, and uncompressed non-scalar
    datasets are re-written with ``compression``. Datasets that are already
    compressed, scalar, or have no shape are copied as they are. Finally the
    textual header found at the start of the input file is replicated into
    the 512-byte userblock of the output file.

    Args:
        input_path: Path of the existing .h5mu file to read.
        output_path: Path of the file to create (opened in ``"w"`` mode).
        compression: Compression filter passed to ``create_dataset``.

    Raises:
        NotImplementedError: If the tree holds an object that is neither a
            ``Group`` nor a ``Dataset``.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the userblock reserved in the output file for the header.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        # Mirror every HDF5 attribute from the source object onto the new one.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        # Callback for visititems(): re-create one group or dataset in output_h5.
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects don't have a shape defined
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole userblock so that a long header is not cut short.
        starting_metadata = input_bytes.read(userblock_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # bytes.find() returns -1 when there is no padding at all; slicing with
    # -1 would silently drop the last header byte, so only truncate on a hit.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]
    # The HDF5 handles are closed at this point, so the userblock of the
    # output file can be overwritten in place.
    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file and overwrite one modality, optionally compressing.

    The file ``h5mu`` is copied, ``modality_data`` is written into the copy
    under ``modality_name`` with ``write_h5ad``, and, when
    ``output_compression`` is set, the result is passed through
    ``compress_h5mu`` to produce ``output_file``.

    Args:
        output_file: Destination path of the final file.
        h5mu: Path of the file to start from.
        modality_name: Name of the modality to write.
        modality_data: Data to store for that modality.
        output_compression: Compression to apply to the output; when falsy
            the copy is written directly to ``output_file`` uncompressed.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    # The copy stays outside the try block on purpose: if it fails (for
    # instance because source and intermediate path are the same file), the
    # cleanup below must not delete anything.
    shutil.copyfile(h5mu, output_file_uncompressed)
    try:
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Remove the intermediate file even when writing or compressing fails.
        output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
// Metadata describing this component: name, entry script, Nextflow version
// constraint, component version, description and author.
manifest {
name = 'metadata/grep_annotation_column'
mainScript = 'main.nf'
nextflowVersion = '!>=20.12.1-edge'
version = '2.1.2'
description = 'Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n'
author = 'Dries Schaumont'
}
// Default container image for processes
process.container = 'nextflow/bash:latest'
// detect tempdir
// Uses the first non-empty value among the environment variables NXF_TEMP,
// VIASH_TEMP, TEMPDIR and TMPDIR, falling back to '/tmp'; the result is
// converted to an absolute path. Referenced by the mount_temp profile.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Selectable configuration profiles.
profiles {
// Disables publishDir for every process (the selector '.*' matches all names)
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at tempDir
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each profile below enables exactly one container engine and explicitly
// disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Singularity additionally turns on autoMounts
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed resource labels: each label maps to one memory or cpu setting.
process{
// Decimal memory labels (multiples of 10^9 bytes for gb, 10^12 bytes for tb)
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory labels (multiples of 2^30 bytes for gib, 2^40 bytes for tib)
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU count labels
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Pull in the additional label definitions shipped as nextflow_labels.config
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
// Default process resources plus the cpu/memory/disk labels that components
// can request.
process {
// Default resources for components that hardly do any processing
// The memory request is multiplied by the attempt number, so it grows on retry
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound read by get_memory() below; while it is null,
// get_memory() returns the requested amount unchanged
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Requests scale with the attempt number and are passed through get_memory()
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested memory amount at process.maxMemory.
//
// to_compare: the requested amount (the label closures pass values such as
//             `4.GB * task.attempt`).
// Returns to_compare unchanged when process.maxMemory is unset or when the
// request does not exceed it; returns process.maxMemory when the request is
// larger, or when the current attempt equals process.maxRetries.
// Prints an error and exits with status 1 when the settings cannot be processed.
// NOTE(review): `task` is not a parameter of this function; confirm that it
// resolves when the function is called from the label closures.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt matching maxRetries, go straight to the maximum
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only guarantees the sign of its result, so test `> 0`
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined variable `max_memory`
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,21 @@
# Inputs
input: # please fill in - example: "sample_path"
# input_column: "foo"
# input_layer: "foo"
modality: # please fill in - example: "rna"
# matrix: "var"
# Outputs
# output: "$id.$key.output.h5mu"
# output_compression: "gzip"
output_match_column: # please fill in - example: "foo"
# output_fraction_column: "foo"
# Query options
regex_pattern: # please fill in - example: "^[mM][tT]-"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"
# Arguments

View File

@@ -0,0 +1,200 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "grep_annotation_column",
"description": "Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n",
"type": "object",
"definitions": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"default": "",
"format": "file-path",
"mimetype": "text/csv",
"pattern": "^\\S+\\.csv$"
}
}
},
"inputs" : {
"title": "Inputs",
"type": "object",
"description": "Arguments related to the input dataset.",
"properties": {
"input": {
"type":
"string",
"description": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu",
"help_text": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu."
}
,
"input_column": {
"type":
"string",
"description": "Type: `string`. Column to query",
"help_text": "Type: `string`. Column to query. If not specified, use .var_names or .obs_names, depending on the value of --matrix"
}
,
"input_layer": {
"type":
"string",
"description": "Type: `string`. Input data to use when calculating fraction of observations that match with the query",
"help_text": "Type: `string`. Input data to use when calculating fraction of observations that match with the query. \nOnly used when --output_fraction_column is provided. If not specified, .X is used.\n"
}
,
"modality": {
"type":
"string",
"description": "Type: `string`, required, example: `rna`. Which modality to get the annotation matrix from",
"help_text": "Type: `string`, required, example: `rna`. Which modality to get the annotation matrix from.\n"
}
,
"matrix": {
"type":
"string",
"description": "Type: `string`, example: `var`, choices: `var`, `obs`. Matrix to fetch the column from that will be searched",
"help_text": "Type: `string`, example: `var`, choices: `var`, `obs`. Matrix to fetch the column from that will be searched.",
"enum": ["var", "obs"]
}
}
},
"outputs" : {
"title": "Outputs",
"type": "object",
"description": "Arguments related to how the output will be written.",
"properties": {
"output": {
"type":
"string",
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. ",
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. "
,
"default":"$id.$key.output.h5mu"
}
,
"output_compression": {
"type":
"string",
"description": "Type: `string`, example: `gzip`, choices: `gzip`, `lzf`. The compression format to be used on the output h5mu object",
"help_text": "Type: `string`, example: `gzip`, choices: `gzip`, `lzf`. The compression format to be used on the output h5mu object.",
"enum": ["gzip", "lzf"]
}
,
"output_match_column": {
"type":
"string",
"description": "Type: `string`, required. Name of the column to write the result to",
"help_text": "Type: `string`, required. Name of the column to write the result to."
}
,
"output_fraction_column": {
"type":
"string",
"description": "Type: `string`. For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern",
"help_text": "Type: `string`. For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern.\n"
}
}
},
"query options" : {
"title": "Query options",
"type": "object",
"description": "Options related to the query",
"properties": {
"regex_pattern": {
"type":
"string",
"description": "Type: `string`, required, example: `^[mM][tT]-`. Regex to use to match with the input column",
"help_text": "Type: `string`, required, example: `^[mM][tT]-`. Regex to use to match with the input column."
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/inputs"
},
{
"$ref": "#/definitions/outputs"
},
{
"$ref": "#/definitions/query options"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to log INFO and above to stdout.

    The root logger's level is set to INFO and a stream handler writing to
    ``sys.stdout`` with the format ``"%(asctime)s %(levelname)-8s %(message)s"``
    is attached. Calling the function repeatedly does not attach additional
    stdout handlers, so messages are not emitted more than once.

    Returns:
        The root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only attach a handler when the root logger has no plain StreamHandler
    # bound to the current stdout yet; otherwise every call would duplicate
    # each log line.
    has_stdout_handler = any(
        type(handler) is logging.StreamHandler
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,376 @@
name: "calculate_qc_metrics"
namespace: "qc"
version: "2.1.2"
authors:
- name: "Dries Schaumont"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
description: "Input h5mu file"
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer"
info: null
example:
- "raw_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Metrics added to .obs"
arguments:
- type: "string"
name: "--var_qc_metrics"
description: "Keys to select a boolean (containing only True or False) column\
\ from .var.\nFor each cell, calculate the proportion of total values for genes\
\ which are labeled 'True', \ncompared to the total sum of the values for all\
\ genes.\n"
info: null
example:
- "ercc,highly_variable,mitochondrial"
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "boolean"
name: "--var_qc_metrics_fill_na_value"
description: "Fill any 'NA' values found in the columns specified with --var_qc_metrics\
\ with 'True' or 'False'.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--top_n_vars"
description: "Number of top vars to be used to calculate cumulative proportions.\n\
If not specified, proportions are not calculated. `--top_n_vars 20;50` finds\n\
cumulative proportion to the 20th and 50th most expressed vars.\n"
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--output_obs_num_nonzero_vars"
description: "Name of column in .obs describing, for each observation, the number\
\ of stored values\n(including explicit zeroes). In other words, the name of\
\ the column that counts\nfor each row the number of columns that contain data.\n"
info: null
default:
- "num_nonzero_vars"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_total_counts_vars"
description: "Name of the column for .obs describing, for each observation (row),\n\
the sum of the stored values in the columns.\n"
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Metrics added to .var"
arguments:
- type: "string"
name: "--output_var_num_nonzero_obs"
description: "Name of column describing, for each feature, the number of stored\
\ values\n(including explicit zeroes). In other words, the name of the column\
\ that counts\nfor each column the number of rows that contain data.\n"
info: null
default:
- "num_nonzero_obs"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_total_counts_obs"
description: "Name of the column in .var describing, for each feature (column),\n\
the sum of the stored values in the rows.\n"
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_obs_mean"
description: "Name of the column in .var providing the mean of the values in each\
\ column.\n"
info: null
default:
- "obs_mean"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_pct_dropout"
description: "Name of the column in .var providing for each feature the percentage\
\ of\nobservations the feature does not appear on (i.e. is missing). Same as\
\ `--output_var_num_nonzero_obs`\nbut percentage based.\n"
info: null
default:
- "pct_dropout"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "The compression format to be used on the output h5mu object."
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are\
\ comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have\
\ slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n\
\ - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n\
\ - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs\
\ metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics}\
\ -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n\
\ - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n\
\ - total_counts -> total_{expr_type}\n \n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "midmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11-slim"
target_tag: "2.1.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scipy"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
- type: "python"
user: false
packages:
- "scanpy"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/qc/calculate_qc_metrics/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/qc/calculate_qc_metrics"
executable: "target/nextflow/qc/calculate_qc_metrics/main.nf"
viash_version: "0.9.4"
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.1-2-ga0c95224865"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"2.1.2\""
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a compressed copy of an .h5mu (HDF5) file.

    Every group and dataset of the input file is re-created in the output
    file. Non-scalar datasets that are not compressed yet are written with
    ``compression``; all other datasets are copied unchanged. Attributes are
    copied for the root object and for every re-created group/dataset.
    Finally, the text header found at the start of the input file is copied
    into the 512-byte user block of the output file.

    Args:
        input_path: Path of the existing file to read.
        output_path: Path of the file to create (opened in "w" mode).
        compression: Compression filter handed to ``create_dataset``.

    Raises:
        NotImplementedError: If a visited object is neither a ``Group`` nor
            a ``Dataset``.
    """
    # Size of the user block reserved at the start of the output file.
    userblock_size = 512
    input_path, output_path = str(input_path), str(output_path)

    def copy_attributes(in_object, out_object):
        """Copy all HDF5 attributes from one object to another."""
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        """Callback for ``visititems``: re-create ``node`` at ``name`` in the output."""
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects don't have a shape defined
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                # Already compressed or scalar: copy as-is.
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole user block so that a header longer than 100 bytes
        # is not cut off.
        starting_metadata = input_bytes.read(userblock_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # partition() keeps the bytes intact when no null byte is present,
    # whereas slicing with the -1 returned by find() would drop the last byte.
    starting_metadata = starting_metadata.partition(b"\x00")[0]

    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file and overwrite one modality, optionally compressing.

    The file ``h5mu`` is copied, ``modality_data`` is written into the copy
    under ``modality_name`` with ``write_h5ad``, and, when
    ``output_compression`` is set, the result is passed through
    ``compress_h5mu`` to produce ``output_file``.

    Args:
        output_file: Destination path of the final file.
        h5mu: Path of the file to start from.
        modality_name: Name of the modality to write.
        modality_data: Data to store for that modality.
        output_compression: Compression to apply to the output; when falsy
            the copy is written directly to ``output_file`` uncompressed.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    # The copy stays outside the try block on purpose: if it fails (for
    # instance because source and intermediate path are the same file), the
    # cleanup below must not delete anything.
    shutil.copyfile(h5mu, output_file_uncompressed)
    try:
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Remove the intermediate file even when writing or compressing fails.
        output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
// Metadata describing this component: name, entry script, Nextflow version
// constraint, component version, description and author.
manifest {
name = 'qc/calculate_qc_metrics'
mainScript = 'main.nf'
nextflowVersion = '!>=20.12.1-edge'
version = '2.1.2'
description = 'Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -> total_{expr_type}\n \n'
author = 'Dries Schaumont'
}
// Default container image for processes
process.container = 'nextflow/bash:latest'
// detect tempdir
// Uses the first non-empty value among the environment variables NXF_TEMP,
// VIASH_TEMP, TEMPDIR and TMPDIR, falling back to '/tmp'; the result is
// converted to an absolute path. Referenced by the mount_temp profile.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Selectable configuration profiles.
profiles {
// Disables publishDir for every process (the selector '.*' matches all names)
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at tempDir
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each profile below enables exactly one container engine and explicitly
// disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Singularity additionally turns on autoMounts
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed resource labels: each label maps to one memory or cpu setting.
process{
// Decimal memory labels (multiples of 10^9 bytes for gb, 10^12 bytes for tb)
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory labels (multiples of 2^30 bytes for gib, 2^40 bytes for tib)
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU count labels
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Pull in the additional label definitions shipped as nextflow_labels.config
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
// Default process resources plus the cpu/memory/disk labels that components
// can request.
process {
// Default resources for components that hardly do any processing
// The memory request is multiplied by the attempt number, so it grows on retry
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound read by get_memory() below; while it is null,
// get_memory() returns the requested amount unchanged
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Requests scale with the attempt number and are passed through get_memory()
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested memory amount at process.maxMemory.
//
// to_compare: the requested amount (the label closures pass values such as
//             `4.GB * task.attempt`).
// Returns to_compare unchanged when process.maxMemory is unset or when the
// request does not exceed it; returns process.maxMemory when the request is
// larger, or when the current attempt equals process.maxRetries.
// Prints an error and exits with status 1 when the settings cannot be processed.
// NOTE(review): `task` is not a parameter of this function; confirm that it
// resolves when the function is called from the label closures.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt matching maxRetries, go straight to the maximum
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only guarantees the sign of its result, so test `> 0`
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined variable `max_memory`
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,27 @@
# Inputs
input: # please fill in - example: "input.h5mu"
modality: "rna"
# layer: "raw_counts"
# Metrics added to .obs
# var_qc_metrics: ["ercc,highly_variable,mitochondrial"]
# var_qc_metrics_fill_na_value: true
# top_n_vars: [123]
output_obs_num_nonzero_vars: "num_nonzero_vars"
output_obs_total_counts_vars: "total_counts"
# Metrics added to .var
output_var_num_nonzero_obs: "num_nonzero_obs"
output_var_total_counts_obs: "total_counts"
output_var_obs_mean: "obs_mean"
output_var_pct_dropout: "pct_dropout"
# Outputs
# output: "$id.$key.output.h5mu"
# output_compression: "gzip"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"
# Arguments

View File

@@ -0,0 +1,259 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "calculate_qc_metrics",
"description": "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -\u003e name in scanpy):\n - pct_dropout -\u003e pct_dropout_by_{expr_type}\n - num_nonzero_obs -\u003e n_cells_by_{expr_type}\n - obs_mean -\u003e mean_{expr_type}\n - total_counts -\u003e total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -\u003e n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -\u003e pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -\u003e total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -\u003e pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -\u003e total_{expr_type}\n \n",
"type": "object",
"definitions": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"default": "",
"format": "file-path",
"mimetype": "text/csv",
"pattern": "^\\S+\\.csv$"
}
}
},
"inputs" : {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type":
"string",
"description": "Type: `file`, required, example: `input.h5mu`. Input h5mu file",
"help_text": "Type: `file`, required, example: `input.h5mu`. Input h5mu file"
}
,
"modality": {
"type":
"string",
"description": "Type: `string`, default: `rna`. ",
"help_text": "Type: `string`, default: `rna`. "
,
"default":"rna"
}
,
"layer": {
"type":
"string",
"description": "Type: `string`, example: `raw_counts`. ",
"help_text": "Type: `string`, example: `raw_counts`. "
}
}
},
"outputs" : {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"output": {
"type":
"string",
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Output h5mu file",
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Output h5mu file."
,
"default":"$id.$key.output.h5mu"
}
,
"output_compression": {
"type":
"string",
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object",
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object.",
"enum": ["gzip", "lzf"]
}
}
},
"metrics added to .obs" : {
"title": "Metrics added to .obs",
"type": "object",
"description": "No description",
"properties": {
"var_qc_metrics": {
"type":
"string",
"description": "Type: List of `string`, example: `ercc,highly_variable,mitochondrial`, multiple_sep: `\";\"`. Keys to select a boolean (containing only True or False) column from ",
"help_text": "Type: List of `string`, example: `ercc,highly_variable,mitochondrial`, multiple_sep: `\";\"`. Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled \u0027True\u0027, \ncompared to the total sum of the values for all genes.\n"
}
,
"var_qc_metrics_fill_na_value": {
"type":
"boolean",
"description": "Type: `boolean`. Fill any \u0027NA\u0027 values found in the columns specified with --var_qc_metrics to \u0027True\u0027 or \u0027False\u0027",
"help_text": "Type: `boolean`. Fill any \u0027NA\u0027 values found in the columns specified with --var_qc_metrics to \u0027True\u0027 or \u0027False\u0027.\nas False.\n"
}
,
"top_n_vars": {
"type":
"string",
"description": "Type: List of `integer`, multiple_sep: `\";\"`. Number of top vars to be used to calculate cumulative proportions",
"help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20;50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n"
}
,
"output_obs_num_nonzero_vars": {
"type":
"string",
"description": "Type: `string`, default: `num_nonzero_vars`. Name of column in ",
"help_text": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n"
,
"default":"num_nonzero_vars"
}
,
"output_obs_total_counts_vars": {
"type":
"string",
"description": "Type: `string`, default: `total_counts`. Name of the column for ",
"help_text": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n"
,
"default":"total_counts"
}
}
},
"metrics added to .var" : {
"title": "Metrics added to .var",
"type": "object",
"description": "No description",
"properties": {
"output_var_num_nonzero_obs": {
"type":
"string",
"description": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)",
"help_text": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n"
,
"default":"num_nonzero_obs"
}
,
"output_var_total_counts_obs": {
"type":
"string",
"description": "Type: `string`, default: `total_counts`. Name of the column in ",
"help_text": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n"
,
"default":"total_counts"
}
,
"output_var_obs_mean": {
"type":
"string",
"description": "Type: `string`, default: `obs_mean`. Name of the column in ",
"help_text": "Type: `string`, default: `obs_mean`. Name of the column in .var providing, for each feature, the mean of the values across all observations.\n"
,
"default":"obs_mean"
}
,
"output_var_pct_dropout": {
"type":
"string",
"description": "Type: `string`, default: `pct_dropout`. Name of the column in ",
"help_text": "Type: `string`, default: `pct_dropout`. Name of the column in .var providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n"
,
"default":"pct_dropout"
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/inputs"
},
{
"$ref": "#/definitions/outputs"
},
{
"$ref": "#/definitions/metrics added to .obs"
},
{
"$ref": "#/definitions/metrics added to .var"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    The root logger's level is set to INFO and a ``StreamHandler`` bound to
    ``sys.stdout`` is attached, formatted as ``<time> <level> <message>``.

    The function is idempotent with respect to handlers: if the root logger
    already has a stream handler writing to the current ``sys.stdout``, no
    second handler is added, so calling this more than once does not make
    every record appear multiple times.

    Returns:
        logging.Logger: the configured root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only attach a handler when none is already writing to this stdout.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,406 @@
name: "qc"
namespace: "workflows/qc"
version: "2.1.2"
authors:
- name: "Dries Schaumont"
roles:
- "author"
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
arguments:
- type: "string"
name: "--id"
description: "ID of the sample."
info: null
example:
- "foo"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Path to the sample."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer"
description: "Layer to calculate qc metrics for."
info: null
example:
- "raw_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Mitochondrial & Ribosomal Gene Detection"
arguments:
- type: "string"
name: "--var_gene_names"
description: ".var column name to be used to detect mitochondrial/ribosomal genes\
\ instead of .var_names (default if not set).\nGene names matching with the\
\ regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will\
\ be \nidentified as mitochondrial or ribosomal genes, respectively.\n"
info: null
example:
- "gene_symbol"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_name_mitochondrial_genes"
description: "In which .var slot to store a boolean array corresponding to the mitochondrial\
\ genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_name_mitochondrial_fraction"
description: ".Obs slot to store the fraction of reads found to be mitochondrial.\
\ Defaults to 'fraction_' suffixed by the value of --var_name_mitochondrial_genes\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--mitochondrial_gene_regex"
description: "Regex string that identifies mitochondrial genes from --var_gene_names.\n\
By default will detect human and mouse mitochondrial genes from a gene symbol.\n"
info: null
default:
- "^[mM][tT]-"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_name_ribosomal_genes"
description: "In which .var slot to store a boolean array corresponding to the ribosomal\
\ genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_name_ribosomal_fraction"
description: "When specified, write the fraction of counts originating from ribosomal\
\ genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified\
\ name.\nRequires --var_name_ribosomal_genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--ribosomal_gene_regex"
description: "Regex string that identifies ribosomal genes from --var_gene_names.\n\
By default will detect human and mouse ribosomal genes from a gene symbol.\n"
info: null
default:
- "^[Mm]?[Rr][Pp][LlSs]"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "QC metrics calculation options"
arguments:
- type: "string"
name: "--var_qc_metrics"
description: "Keys to select a boolean (containing only True or False) column\
\ from .var.\nFor each cell, calculate the proportion of total values for genes\
\ which are labeled 'True', \ncompared to the total sum of the values for all\
\ genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n"
info: null
example:
- "ercc,highly_variable"
required: false
direction: "input"
multiple: true
multiple_sep: ","
- type: "integer"
name: "--top_n_vars"
description: "Number of top vars to be used to calculate cumulative proportions.\n\
If not specified, proportions are not calculated. `--top_n_vars 20,50` finds\n\
cumulative proportion to the 20th and 50th most expressed vars.\n"
info: null
default:
- 50
- 100
- 200
- 500
required: false
direction: "input"
multiple: true
multiple_sep: ","
- type: "string"
name: "--output_obs_num_nonzero_vars"
description: "Name of column in .obs describing, for each observation, the number\
\ of stored values\n(including explicit zeroes). In other words, the name of\
\ the column that counts\nfor each row the number of columns that contain data.\n"
info: null
default:
- "num_nonzero_vars"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_total_counts_vars"
description: "Name of the column for .obs describing, for each observation (row),\n\
the sum of the stored values in the columns.\n"
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_num_nonzero_obs"
description: "Name of column describing, for each feature, the number of stored\
\ values\n(including explicit zeroes). In other words, the name of the column\
\ that counts\nfor each column the number of rows that contain data.\n"
info: null
default:
- "num_nonzero_obs"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_total_counts_obs"
description: "Name of the column in .var describing, for each feature (column),\n\
the sum of the stored values in the rows.\n"
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_obs_mean"
description: "Name of the column in .var providing, for each feature, the mean of\
\ the values across all observations.\n"
info: null
default:
- "obs_mean"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_pct_dropout"
description: "Name of the column in .var providing for each feature the percentage\
\ of\nobservations the feature does not appear on (i.e. is missing). Same as\
\ `--output_var_num_nonzero_obs`\nbut percentage based.\n"
info: null
default:
- "pct_dropout"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
description: "Destination path to the output."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "nextflow_script"
path: "main.nf"
is_executable: true
entrypoint: "run_wf"
- type: "file"
path: "utils"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "A pipeline to add basic qc statistics to a MuData "
test_resources:
- type: "nextflow_script"
path: "test.nf"
is_executable: true
entrypoint: "test_wf"
- type: "file"
path: "concat_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
info:
test_dependencies:
- name: "qc_test"
namespace: "test_workflows/qc"
status: "enabled"
scope:
image: "public"
target: "public"
dependencies:
- name: "metadata/grep_annotation_column"
repository:
type: "local"
- name: "qc/calculate_qc_metrics"
repository:
type: "local"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "nextflow"
id: "nextflow"
directives:
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
build_info:
config: "src/workflows/qc/qc/config.vsh.yaml"
runner: "nextflow"
engine: "native"
output: "target/nextflow/workflows/qc/qc"
executable: "target/nextflow/workflows/qc/qc/main.nf"
viash_version: "0.9.4"
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.1-2-ga0c95224865"
dependencies:
- "target/nextflow/metadata/grep_annotation_column"
- "target/nextflow/qc/calculate_qc_metrics"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"2.1.2\""
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the 'workflows/qc/qc' workflow: name, entry script,
// required Nextflow version, workflow version, description and author.
manifest {
name = 'workflows/qc/qc'
mainScript = 'main.nf'
// NOTE(review): the leading '!' presumably makes this minimum version a hard
// requirement rather than a warning — confirm against the Nextflow manifest docs.
nextflowVersion = '!>=20.12.1-edge'
version = '2.1.2'
description = 'A pipeline to add basic qc statistics to a MuData '
author = 'Dries Schaumont'
}
// Container image assigned to the process scope as a whole.
process.container = 'nextflow/bash:latest'
// detect tempdir
// Takes the first of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR that is set and
// non-empty, falling back to '/tmp', and converts it to an absolute path.
// The value is consumed by the 'mount_temp' profile below.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Selectable configuration profiles: one for disabling publishing, one for
// mounting the temp directory, and one per container engine.
profiles {
// Disables publishDir for every process (the selector '.*' matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each engine profile below enables exactly one container engine and
// explicitly disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Singularity additionally turns on autoMounts.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed resource labels. A process carrying one of these labels gets the
// corresponding memory or cpus value.
process{
// Decimal memory labels: memNgb = N * 10^9 bytes, memNtb = N * 10^12 bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory labels: memNgib = N * 2^30 bytes, memNtib = N * 2^40 bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU labels: cpuN = N cpus.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Pulls in the additional label definitions from nextflow_labels.config.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
// Process-scope defaults plus the coarse-grained CPU / memory / disk labels.
process {
// Default resources for components that hardly do any processing
// (memory grows linearly with the attempt number: 2 GB, 4 GB, ...)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 inclusive retry; anything else terminates)
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound read by get_memory() below; null leaves the
// memory labels uncapped.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each request scales with the attempt number and is passed through get_memory().
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// Every disk label evaluates to process.disk when that is set, otherwise null.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
/**
 * Cap a requested memory amount at `process.maxMemory`.
 *
 * @param to_compare the requested memory; the label selectors in this file
 *                   pass `N.GB * task.attempt`.
 * @return `to_compare` unchanged when `process.maxMemory` is missing or falsy,
 *         or when the request does not exceed it; `process.maxMemory` when the
 *         request exceeds it, or when the current attempt equals
 *         `process.maxRetries`.
 *
 * Any exception raised while evaluating the cap is reported on stdout and the
 * JVM is terminated with exit status 1.
 */
def get_memory(to_compare) {
    // No cap configured: hand the request back untouched.
    if (!process.containsKey("maxMemory") || !process.maxMemory) {
        return to_compare
    }
    try {
        // NOTE(review): `task` is not a parameter of this function — confirm it
        // resolves in this scope when a cap is actually configured.
        if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
            // Final attempt: go straight to the configured maximum.
            return process.maxMemory
        }
        else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
            // Request exceeds the cap. This branch used to reference the
            // undefined name `max_memory`, which sent every over-cap request
            // into the catch block below and aborted the run.
            return process.maxMemory as nextflow.util.MemoryUnit
        }
        else {
            return to_compare
        }
    } catch (all) {
        println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
        System.exit(1)
    }
}

View File

@@ -0,0 +1,33 @@
# Inputs
id: # please fill in - example: "foo"
input: # please fill in - example: "input.h5mu"
modality: "rna"
# layer: "raw_counts"
# Mitochondrial & Ribosomal Gene Detection
# var_gene_names: "gene_symbol"
# var_name_mitochondrial_genes: "foo"
# obs_name_mitochondrial_fraction: "foo"
mitochondrial_gene_regex: "^[mM][tT]-"
# var_name_ribosomal_genes: "foo"
# obs_name_ribosomal_fraction: "foo"
ribosomal_gene_regex: "^[Mm]?[Rr][Pp][LlSs]"
# QC metrics calculation options
# var_qc_metrics: ["ercc,highly_variable"]
top_n_vars: [50, 100, 200, 500]
output_obs_num_nonzero_vars: "num_nonzero_vars"
output_obs_total_counts_vars: "total_counts"
output_var_num_nonzero_obs: "num_nonzero_obs"
output_var_total_counts_obs: "total_counts"
output_var_obs_mean: "obs_mean"
output_var_pct_dropout: "pct_dropout"
# Outputs
# output: "$id.$key.output.h5mu"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"
# Arguments

View File

@@ -0,0 +1,320 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "qc",
"description": "A pipeline to add basic qc statistics to a MuData ",
"type": "object",
"definitions": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"default": "",
"format": "file-path",
"mimetype": "text/csv",
"pattern": "^\\S+\\.csv$"
}
}
},
"inputs" : {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"id": {
"type":
"string",
"description": "Type: `string`, required, example: `foo`. ID of the sample",
"help_text": "Type: `string`, required, example: `foo`. ID of the sample."
}
,
"input": {
"type":
"string",
"description": "Type: `file`, required, example: `input.h5mu`. Path to the sample",
"help_text": "Type: `file`, required, example: `input.h5mu`. Path to the sample."
}
,
"modality": {
"type":
"string",
"description": "Type: `string`, default: `rna`. Which modality to process",
"help_text": "Type: `string`, default: `rna`. Which modality to process."
,
"default":"rna"
}
,
"layer": {
"type":
"string",
"description": "Type: `string`, example: `raw_counts`. Layer to calculate qc metrics for",
"help_text": "Type: `string`, example: `raw_counts`. Layer to calculate qc metrics for."
}
}
},
"outputs" : {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"output": {
"type":
"string",
"description": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Destination path to the output",
"help_text": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Destination path to the output."
,
"default":"$id.$key.output.h5mu"
}
}
},
"mitochondrial & ribosomal gene detection" : {
"title": "Mitochondrial & Ribosomal Gene Detection",
"type": "object",
"description": "No description",
"properties": {
"var_gene_names": {
"type":
"string",
"description": "Type: `string`, example: `gene_symbol`. ",
"help_text": "Type: `string`, example: `gene_symbol`. .var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively.\n"
}
,
"var_name_mitochondrial_genes": {
"type":
"string",
"description": "Type: `string`. In which ",
"help_text": "Type: `string`. In which .var slot to store a boolean array corresponding to the mitochondrial genes.\n"
}
,
"obs_name_mitochondrial_fraction": {
"type":
"string",
"description": "Type: `string`. ",
"help_text": "Type: `string`. .Obs slot to store the fraction of reads found to be mitochondrial. Defaults to \u0027fraction_\u0027 suffixed by the value of --var_name_mitochondrial_genes\n"
}
,
"mitochondrial_gene_regex": {
"type":
"string",
"description": "Type: `string`, default: `^[mM][tT]-`. Regex string that identifies mitochondrial genes from --var_gene_names",
"help_text": "Type: `string`, default: `^[mM][tT]-`. Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n"
,
"default":"^[mM][tT]-"
}
,
"var_name_ribosomal_genes": {
"type":
"string",
"description": "Type: `string`. In which ",
"help_text": "Type: `string`. In which .var slot to store a boolean array corresponding to the ribosomal genes.\n"
}
,
"obs_name_ribosomal_fraction": {
"type":
"string",
"description": "Type: `string`. When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an ",
"help_text": "Type: `string`. When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n"
}
,
"ribosomal_gene_regex": {
"type":
"string",
"description": "Type: `string`, default: `^[Mm]?[Rr][Pp][LlSs]`. Regex string that identifies ribosomal genes from --var_gene_names",
"help_text": "Type: `string`, default: `^[Mm]?[Rr][Pp][LlSs]`. Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n"
,
"default":"^[Mm]?[Rr][Pp][LlSs]"
}
}
},
"qc metrics calculation options" : {
"title": "QC metrics calculation options",
"type": "object",
"description": "No description",
"properties": {
"var_qc_metrics": {
"type":
"string",
"description": "Type: List of `string`, example: `ercc,highly_variable`, multiple_sep: `\",\"`. Keys to select a boolean (containing only True or False) column from ",
"help_text": "Type: List of `string`, example: `ercc,highly_variable`, multiple_sep: `\",\"`. Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled \u0027True\u0027, \ncompared to the total sum of the values for all genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n"
}
,
"top_n_vars": {
"type":
"string",
"description": "Type: List of `integer`, default: `50,100,200,500`, multiple_sep: `\",\"`. Number of top vars to be used to calculate cumulative proportions",
"help_text": "Type: List of `integer`, default: `50,100,200,500`, multiple_sep: `\",\"`. Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20,50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n"
,
"default":"50,100,200,500"
}
,
"output_obs_num_nonzero_vars": {
"type":
"string",
"description": "Type: `string`, default: `num_nonzero_vars`. Name of column in ",
"help_text": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n"
,
"default":"num_nonzero_vars"
}
,
"output_obs_total_counts_vars": {
"type":
"string",
"description": "Type: `string`, default: `total_counts`. Name of the column for ",
"help_text": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n"
,
"default":"total_counts"
}
,
"output_var_num_nonzero_obs": {
"type":
"string",
"description": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)",
"help_text": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n"
,
"default":"num_nonzero_obs"
}
,
"output_var_total_counts_obs": {
"type":
"string",
"description": "Type: `string`, default: `total_counts`. Name of the column in ",
"help_text": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n"
,
"default":"total_counts"
}
,
"output_var_obs_mean": {
"type":
"string",
"description": "Type: `string`, default: `obs_mean`. Name of the column in ",
"help_text": "Type: `string`, default: `obs_mean`. Name of the column in .var providing, for each feature, the mean of the values across all observations.\n"
,
"default":"obs_mean"
}
,
"output_var_pct_dropout": {
"type":
"string",
"description": "Type: `string`, default: `pct_dropout`. Name of the column in ",
"help_text": "Type: `string`, default: `pct_dropout`. Name of the column in .var providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n"
,
"default":"pct_dropout"
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/inputs"
},
{
"$ref": "#/definitions/outputs"
},
{
"$ref": "#/definitions/mitochondrial & ribosomal gene detection"
},
{
"$ref": "#/definitions/qc metrics calculation options"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,36 @@
// Named configuration profiles; each nested scope below is one profile.
profiles {
// detect tempdir
// The first of the environment variables below that is set to a non-empty
// value is used; '/tmp' is the fallback. The result is made absolute.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// mount_temp: hand the detected temp directory to the container engines.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// no_publish: disable publishDir for every process (the '.*' selector
// matches all process names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// docker: enable Docker and explicitly switch the other engines off.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
}

View File

@@ -0,0 +1,66 @@
// Default process resources plus label-based overrides.
process {
// Default resources for components that hardly do any processing
// (the memory request grows linearly with the attempt number)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 are retried; anything else terminates the run)
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound read by get_memory(); while it is null, get_memory()
// returns the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// (each request scales with the attempt number and is passed through get_memory())
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// (each label re-uses process.disk when it is set and yields null otherwise)
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested memory amount at the optional `process.maxMemory` limit.
//
//  * `process.maxMemory` absent or falsy -> the request is returned unchanged.
//  * current attempt equals `process.maxRetries` -> `process.maxMemory` is
//    returned, so the last attempt gets the full cap.
//  * request larger than the cap -> the cap is returned.
//  * otherwise -> the request is returned.
//
// Parameter `to_compare`: the requested memory; it must support `compareTo`
// against a `nextflow.util.MemoryUnit`.
//
// Any exception raised while evaluating the limits is reported on stdout and
// the JVM is terminated with exit code 1.
//
// NOTE(review): `task` is not a parameter of this function; it is presumably
// resolved from the context of the directive closure that calls it - confirm.
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }
  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined name `max_memory`, which always
      // threw and ended in the catch block below instead of applying the cap.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,33 @@
// Label overrides with fixed values: every memory label resolves to 13.Gb and
// every CPU label shown here to 4 CPUs, regardless of the label's name.
// NOTE(review): presumably sized for a small CI runner - confirm.
process {
withLabel: lowmem { memory = 13.Gb }
withLabel: lowcpu { cpus = 4 }
withLabel: midmem { memory = 13.Gb }
withLabel: midcpu { cpus = 4 }
withLabel: highmem { memory = 13.Gb }
withLabel: highcpu { cpus = 4 }
withLabel: veryhighmem { memory = 13.Gb }
// Disk labels re-use process.disk when it is set and yield null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
}
// Export NUMBA_CACHE_DIR=/tmp into the task environment.
env.NUMBA_CACHE_DIR = '/tmp'
// Write a trace report and allow an existing one to be overwritten.
trace {
enabled = true
overwrite = true
}
// Allow an existing DAG file to be overwritten.
dag {
overwrite = true
}
// Run at most one instance of each process at a time.
process.maxForks = 1

View File

@@ -0,0 +1,187 @@
name: "move_files_to_directory"
version: "v0.2.0"
authors:
- name: "Dorien Roosen"
roles:
- "maintainer"
info:
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
description: "Paths of the files that will be copied into the output directory."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: true
multiple_sep: ";"
- type: "file"
name: "--output"
description: "Path to output directory"
info: null
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "bash_script"
path: "script.sh"
is_executable: true
summary: "Publish one or multiple files to the same directory"
description: "This component copies one or multiple files to the same destination\
\ directory, creating the output directory if it doesn't exist."
test_resources:
- type: "bash_script"
path: "test.sh"
is_executable: true
info: null
status: "enabled"
scope:
image: "public"
target: "public"
requirements:
commands:
- "ps"
license: "MIT"
links:
repository: "https://github.com/viash-hub/craftbox"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "debian:latest"
target_registry: "images.viash-hub.com"
target_tag: "v0.2.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/move_files_to_directory/config.vsh.yaml"
runner: "nextflow"
engine: "docker|native"
output: "target/nextflow/move_files_to_directory"
executable: "target/nextflow/move_files_to_directory/main.nf"
viash_version: "0.9.4"
git_commit: "1c1b0a4a1aff891ab678072b0ba915ac3ac71610"
git_remote: "https://github.com/viash-hub/craftbox"
git_tag: "v0.1.0-8-g1c1b0a4"
package_config:
name: "craftbox"
version: "v0.2.0"
summary: "A collection of custom-tailored scripts and applied utilities built with\
\ Viash.\n"
description: "`craftbox` is a curated collection of custom scripts and utilities\
\ designed to tackle context-specific tasks.\n\nEmphasizing the Viash principles,\
\ `craftbox` components aim for **reusability**, **reproducibility**, and adherence\
\ to **best practices**. Key features generally include:\n\n* **Standalone & Nextflow\
\ Ready:** Components are built to run directly via the command line or be smoothly\
\ integrated into Nextflow workflows.\n* **Custom Implementations:** Contains\
\ scripts and tools developed for particular tasks that may not be found in broader\
\ collections.\n* **High Quality Standards (promoted by Viash):**\n * Clear\
\ documentation for components and their parameters.\n * Full exposure of underlying\
\ script/tool arguments for fine-grained control.\n * Containerized (Docker)\
\ to ensure dependency management and a consistent, reproducible runtime environment.\n\
\ * Unit tested where applicable to ensure components function as expected.\n"
info: null
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".requirements.commands := ['ps']\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'v0.2.0'"
keywords:
- "scripts"
- "custom"
- "implementations"
- "utilities"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/viash-hub/craftbox"
issue_tracker: "https://github.com/viash-hub/craftbox/issues"

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the move_files_to_directory component.
manifest {
name = 'move_files_to_directory'
mainScript = 'main.nf'
// Minimum Nextflow version; see the Nextflow manifest documentation for the
// meaning of the leading '!'.
nextflowVersion = '!>=20.12.1-edge'
version = 'v0.2.0'
description = 'This component copies one or multiple files to the same destination directory, creating the output directory if it doesn\'t exist.'
author = 'Dorien Roosen'
}
// Default container image for every process.
process.container = 'nextflow/bash:latest'
// detect tempdir
// The first of the environment variables below that is set to a non-empty
// value is used; '/tmp' is the fallback. The result is made absolute.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Named configuration profiles; each nested scope below is one profile.
// Every container-engine profile enables exactly one engine and explicitly
// disables the other four.
profiles {
// no_publish: disable publishDir for every process (the '.*' selector
// matches all process names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// mount_temp: hand the detected temp directory (tempDir, defined above) to
// the container engines.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// singularity additionally turns on autoMounts.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Resource labels: each label maps to a fixed memory or CPU request.
// memNgb/memNtb labels use decimal units (powers of 1000), memNgib/memNtib
// labels use binary units (powers of 1024); all values are written in bytes.
process{
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU labels: cpuN requests N CPUs.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}

View File

@@ -0,0 +1,81 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "move_files_to_directory",
"description": "This component copies one or multiple files to the same destination directory, creating the output directory if it doesn\u0027t exist.",
"type": "object",
"definitions": {
"arguments" : {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type":
"string",
"description": "Type: List of `file`, required, multiple_sep: `\";\"`. Paths of the files that will be copied into the output directory",
"help_text": "Type: List of `file`, required, multiple_sep: `\";\"`. Paths of the files that will be copied into the output directory."
}
,
"output": {
"type":
"string",
"description": "Type: `file`, required, default: `$id.$key.output`. Path to output directory",
"help_text": "Type: `file`, required, default: `$id.$key.output`. Path to output directory"
,
"default":"$id.$key.output"
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
,
"param_list": {
"type":
"string",
"description": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel",
"help_text": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob.\n\n* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ [\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027], [\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027] ]`.\n* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`.\n* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]`.\n* A yaml blob can also be passed directly as a string. Example: `--param_list \"[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]\"`.\n\nWhen passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/arguments"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,266 @@
name: "detect_ingestion_method"
namespace: "ingestion_qc"
version: "v0.1.0"
authors:
- name: "Dorien Roosen"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
description: "The input h5mu file(s)"
info: null
example:
- "path/to/file.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "The modality to use"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "string"
name: "--output_uns_ingestion_method"
description: "The .uns field in which to store the experimental setup. Values stored\
\ are `cellranger_multi`, `xenium` or `cosmx`."
info: null
default:
- "ingestion_method"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
description: "The output h5mu file, containing an .uns field with experiment description."
info: null
example:
- "path/to/file.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Detects the ingestion method of a dataset.\nCurrently detects either\
\ 10X CellRanger Multi, 10X Xenium or Nanostring CosMx, but can be extended to other\
\ technologies upon request.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "sample_one.qc.h5mu"
- type: "file"
path: "Lung5_Rep2_tiny.qc.h5mu"
- type: "file"
path: "xenium_tiny.qc.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
requirements:
commands:
- "ps"
repositories:
- type: "github"
name: "openpipeline"
repo: "openpipelines-bio/openpipeline"
tag: "2.1.2"
- type: "vsh"
name: "craftbox"
repo: "craftbox"
tag: "v0.2.0"
links:
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "lowdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_registry: "images.viash-hub.com"
target_tag: "v0.1.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/ingestion_qc/detect_ingestion_method/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/ingestion_qc/detect_ingestion_method"
executable: "target/executable/ingestion_qc/detect_ingestion_method/detect_ingestion_method"
viash_version: "0.9.4"
git_commit: "4de00a2614069bdaee27943e73a51d378e465c60"
git_remote: "https://github.com/openpipelines-bio/openpipeline_qc"
git_tag: "v0.1.0"
package_config:
name: "openpipeline_qc"
version: "v0.1.0"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-bio/openpipeline_incubator/resources_test"
dest: "resources_test"
repositories:
- type: "github"
name: "openpipeline"
repo: "openpipelines-bio/openpipeline"
tag: "2.1.2"
- type: "vsh"
name: "craftbox"
repo: "craftbox"
tag: "v0.2.0"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].directives.tag\
\ := '$id'\n.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'v0.1.0'"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
docker_registry: "ghcr.io"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,66 @@
// Default process resources plus label-based overrides.
process {
// Default resources for components that hardly do any processing
// (the memory request grows linearly with the attempt number)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 are retried; anything else terminates the run)
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound read by get_memory(); while it is null, get_memory()
// returns the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// (each request scales with the attempt number and is passed through get_memory())
withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// (each label re-uses process.disk when it is set and yields null otherwise)
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested memory amount at the optional `process.maxMemory` limit.
//
//  * `process.maxMemory` absent or falsy -> the request is returned unchanged.
//  * current attempt equals `process.maxRetries` -> `process.maxMemory` is
//    returned, so the last attempt gets the full cap.
//  * request larger than the cap -> the cap is returned.
//  * otherwise -> the request is returned.
//
// Parameter `to_compare`: the requested memory; it must support `compareTo`
// against a `nextflow.util.MemoryUnit`.
//
// Any exception raised while evaluating the limits is reported on stdout and
// the JVM is terminated with exit code 1.
//
// NOTE(review): `task` is not a parameter of this function; it is presumably
// resolved from the context of the directive closure that calls it - confirm.
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }
  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined name `max_memory`, which always
      // threw and ended in the catch block below instead of applying the cap.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    The root logger's level is set to INFO and a stream handler writing to
    ``sys.stdout`` is attached, formatting records as
    ``<asctime> <levelname, padded to 8> <message>``.

    The function is idempotent: when a stream handler bound to the current
    ``sys.stdout`` is already attached, no second handler is added, so calling
    it repeatedly does not duplicate every log line.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only attach a console handler if none is writing to this stdout yet;
    # otherwise each extra call would make every record print once more.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,268 @@
name: "generate_html"
namespace: "ingestion_qc"
version: "v0.1.0"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
- name: "Dorien Roosen"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Robrecht Cannoodt"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input_data"
description: "The input JSON file containing the QC metrics"
info: null
example:
- "path/to/file.json"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--input_structure"
description: "The input JSON file containing the structure of the data"
info: null
example:
- "path/to/file.json"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output_qc_report"
description: "The output HTML report"
info: null
example:
- "path/to/file.html"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "bash_script"
path: "script.sh"
is_executable: true
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Generate an HTML report from the QC metrics"
test_resources:
- type: "bash_script"
path: "test.sh"
is_executable: true
- type: "file"
path: "sc_dataset.json"
- type: "file"
path: "sc_report_structure.json"
- type: "file"
path: "xenium_dataset.json"
- type: "file"
path: "xenium_report_structure.json"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
requirements:
commands:
- "ps"
repositories:
- type: "github"
name: "openpipeline"
repo: "openpipelines-bio/openpipeline"
tag: "2.1.2"
- type: "vsh"
name: "craftbox"
repo: "craftbox"
tag: "v0.2.0"
links:
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "lowmem"
- "lowdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "node:latest"
target_registry: "images.viash-hub.com"
target_tag: "v0.1.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "docker"
run:
- "npm install -g pnpm@latest-10 \\\n&& cd /opt && git clone -b v0.1.0 https://github.com/openpipelines-bio/siqc.git\
\ \\\n&& cd siqc && pnpm install \\\n&& true\n"
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/ingestion_qc/generate_html/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/ingestion_qc/generate_html"
executable: "target/executable/ingestion_qc/generate_html/generate_html"
viash_version: "0.9.4"
git_commit: "4de00a2614069bdaee27943e73a51d378e465c60"
git_remote: "https://github.com/openpipelines-bio/openpipeline_qc"
git_tag: "v0.1.0"
package_config:
name: "openpipeline_qc"
version: "v0.1.0"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-bio/openpipeline_incubator/resources_test"
dest: "resources_test"
repositories:
- type: "github"
name: "openpipeline"
repo: "openpipelines-bio/openpipeline"
tag: "2.1.2"
- type: "vsh"
name: "craftbox"
repo: "craftbox"
tag: "v0.2.0"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].directives.tag\
\ := '$id'\n.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'v0.1.0'"
organization: "vsh"
links:
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
docker_registry: "ghcr.io"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,66 @@
// Default process resources plus label-based overrides.
process {
// Default resources for components that hardly do any processing
// (the memory request grows linearly with the attempt number)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 are retried; anything else terminates the run)
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound read by get_memory(); while it is null, get_memory()
// returns the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// (each request scales with the attempt number and is passed through get_memory())
withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// (each label re-uses process.disk when it is set and yields null otherwise)
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested memory amount at the optional `process.maxMemory` limit.
//
//  * `process.maxMemory` absent or falsy -> the request is returned unchanged.
//  * current attempt equals `process.maxRetries` -> `process.maxMemory` is
//    returned, so the last attempt gets the full cap.
//  * request larger than the cap -> the cap is returned.
//  * otherwise -> the request is returned.
//
// Parameter `to_compare`: the requested memory; it must support `compareTo`
// against a `nextflow.util.MemoryUnit`.
//
// Any exception raised while evaluating the limits is reported on stdout and
// the JVM is terminated with exit code 1.
//
// NOTE(review): `task` is not a parameter of this function; it is presumably
// resolved from the context of the directive closure that calls it - confirm.
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }
  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined name `max_memory`, which always
      // threw and ended in the catch block below instead of applying the cap.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

Some files were not shown because too many files have changed in this diff Show More