Build branch openpipeline_qc/v0.1.0 with version v0.1.0 to openpipeline_qc on branch v0.1 (4de00a2)
Build pipeline: test-vsh-ci-build-template-8gzht
Source commit: 4de00a2614
Source message: release v0.1
This commit is contained in:
27
.gitignore
vendored
Normal file
27
.gitignore
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
# IDEs and editors
|
||||
/.idea
|
||||
.project
|
||||
.classpath
|
||||
*.launch
|
||||
.settings/
|
||||
.vscode
|
||||
|
||||
# Temp
|
||||
gitignore
|
||||
test_results
|
||||
|
||||
# System Files
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Nextflow
|
||||
work
|
||||
.nextflow*
|
||||
trace*.txt
|
||||
|
||||
# viash
|
||||
/resources_test/
|
||||
|
||||
# pycache
|
||||
*__pycache__*
|
||||
|
||||
3
CHANGELOG.MD
Normal file
3
CHANGELOG.MD
Normal file
@@ -0,0 +1,3 @@
|
||||
# openpipeline_qc x.x.x
|
||||
|
||||
Initial release containing a QC Reporting workflow for Xenium or CellRanger Multi ingested data, with corresponding components.
|
||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 openpipelines-bio
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
28
_viash.yaml
Normal file
28
_viash.yaml
Normal file
@@ -0,0 +1,28 @@
|
||||
viash_version: 0.9.4
|
||||
version: v0.1.0
|
||||
source: src
|
||||
target: target
|
||||
name: openpipeline_qc
|
||||
organization: vsh
|
||||
links:
|
||||
repository: https://github.com/openpipelines-bio/openpipeline_qc
|
||||
docker_registry: ghcr.io
|
||||
repositories:
|
||||
- name: openpipeline
|
||||
repo: openpipelines-bio/openpipeline
|
||||
type: github
|
||||
tag: 2.1.2
|
||||
- name: craftbox
|
||||
repo: craftbox
|
||||
type: vsh
|
||||
tag: v0.2.0
|
||||
info:
|
||||
test_resources:
|
||||
- type: s3
|
||||
path: s3://openpipelines-bio/openpipeline_incubator/resources_test
|
||||
dest: resources_test
|
||||
config_mods: |
|
||||
.requirements.commands := ['ps']
|
||||
.runners[.type == 'nextflow'].directives.tag := '$id'
|
||||
.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'
|
||||
0
nextflow.config
Normal file
0
nextflow.config
Normal file
166
resources_test_scripts/qc_sample_data.sh
Executable file
166
resources_test_scripts/qc_sample_data.sh
Executable file
@@ -0,0 +1,166 @@
|
||||
#!/bin/bash

# Output directories for the generated single-cell and spatial QC test resources.
OUT_DIR=resources_test/qc_sample_data
OUT_DIR_SPATIAL=resources_test/spatial_qc_sample_data

# mkdir -p succeeds silently when a directory already exists, so no explicit
# existence test is required.
mkdir -p "$OUT_DIR" "$OUT_DIR_SPATIAL"
|
||||
|
||||
# fetch/create h5mu from somewhere
|
||||
cat > /tmp/params_create_h5mu.yaml <<EOF
|
||||
param_list:
|
||||
- id: sample_one
|
||||
input_id: sample_one
|
||||
input: s3://openpipelines-data/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_qc.h5mu
|
||||
- id: sample_two
|
||||
input_id: sample_two
|
||||
input: s3://openpipelines-data/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_qc.h5mu
|
||||
output: '\$id.qc.h5mu'
|
||||
output_compression: gzip
|
||||
publish_dir: "$OUT_DIR"
|
||||
EOF
|
||||
|
||||
# add the sample ID to the mudata object
|
||||
nextflow run openpipelines-bio/openpipeline \
|
||||
-latest \
|
||||
-r 2.1.2 \
|
||||
-main-script target/nextflow/metadata/add_id/main.nf \
|
||||
-c src/configs/labels_ci.config \
|
||||
-profile docker \
|
||||
-params-file /tmp/params_create_h5mu.yaml \
|
||||
-resume
|
||||
|
||||
cat > /tmp/params_subset.yaml <<EOF
|
||||
param_list:
|
||||
- id: sample_one
|
||||
input: resources_test/qc_sample_data/sample_one.qc.h5mu
|
||||
- id: sample_two
|
||||
input: resources_test/qc_sample_data/sample_two.qc.h5mu
|
||||
output: '\$id.qc.h5mu'
|
||||
number_of_observations: 10000
|
||||
output_compression: gzip
|
||||
publish_dir: "$OUT_DIR"
|
||||
EOF
|
||||
|
||||
# subset h5mus
|
||||
nextflow run openpipelines-bio/openpipeline \
|
||||
-latest \
|
||||
-r 2.1.2 \
|
||||
-main-script target/nextflow/filter/subset_h5mu/main.nf \
|
||||
-c src/configs/labels_ci.config \
|
||||
-profile docker \
|
||||
-params-file /tmp/params_subset.yaml \
|
||||
-resume
|
||||
|
||||
# Write a helper Python script that adds random .obs metadata columns to the
# "rna" modality of every h5mu file in resources_test/qc_sample_data.
# NOTE: the heredoc delimiter is unquoted, so the shell expands the pwd command
# substitution below before the Python file is written.
cat > /tmp/add_metadata_obs.py <<EOF
import mudata as mu
import glob
import numpy as np
import pandas as pd
import os
import zlib

# Directory containing the h5mu files
out_dir = "$(pwd)/resources_test/qc_sample_data"

# List of h5mu files
h5mu_files = glob.glob(os.path.join(out_dir, "*.h5mu"))
print(f"Found {len(h5mu_files)} h5mu files: {h5mu_files}")

# Metadata values to randomly assign
donor_ids = ["donor_1", "donor_2", "donor_3"]
cell_types = ["CD4+ T cell", "CD8+ T cell", "B cell", "NK cell", "Monocyte"]
batches = ["batch_A", "batch_B"]
conditions = ["treated", "control"]

for h5mu_file in h5mu_files:
    print(f"Processing {h5mu_file}...")

    # Load MuData object
    mdata = mu.read_h5mu(h5mu_file)
    rna = mdata.mod["rna"]
    n_obs = rna.n_obs

    # Different seed for each file but reproducible. The built-in hash() of a
    # str is salted per interpreter run, so a CRC32 checksum of the file name
    # is used instead to get the same seed on every run and every machine.
    file_seed = 42 + zlib.crc32(os.path.basename(h5mu_file).encode("utf-8")) % 100
    np.random.seed(file_seed)

    # Create metadata
    rna.obs["donor_id"] = np.random.choice(donor_ids, size=n_obs)
    rna.obs["cell_type"] = np.random.choice(cell_types, size=n_obs)
    rna.obs["batch"] = np.random.choice(batches, size=n_obs)
    rna.obs["condition"] = np.random.choice(conditions, size=n_obs)

    # Add a continuous variable too
    rna.obs["quality_score"] = np.random.uniform(0, 1, size=n_obs)

    # Save the modified MuData object
    mu.write_h5mu(h5mu_file, mdata)
    print(f"Added metadata to {h5mu_file}")

print("All files processed successfully!")
EOF
|
||||
|
||||
# Execute the Python script
|
||||
python /tmp/add_metadata_obs.py
|
||||
|
||||
# generate cellbender out for testing
|
||||
cat > /tmp/params_cellbender.yaml <<EOF
|
||||
param_list:
|
||||
- id: sample_one
|
||||
input: resources_test/qc_sample_data/sample_one.qc.h5mu
|
||||
- id: sample_two
|
||||
input: resources_test/qc_sample_data/sample_two.qc.h5mu
|
||||
output: '\$id.qc.cellbender.h5mu'
|
||||
epochs: 5
|
||||
output_compression: gzip
|
||||
publish_dir: "$OUT_DIR"
|
||||
EOF
|
||||
|
||||
nextflow run openpipelines-bio/openpipeline \
|
||||
-latest \
|
||||
-r 2.1.2 \
|
||||
-main-script target/nextflow/correction/cellbender_remove_background/main.nf \
|
||||
-c src/configs/labels_ci.config \
|
||||
-profile docker \
|
||||
-params-file /tmp/params_cellbender.yaml \
|
||||
-resume
|
||||
|
||||
# fetch spatial sample data from s3
|
||||
aws s3 sync \
|
||||
--profile di \
|
||||
s3://openpipelines-bio/openpipeline_incubator/resources_test/spatial_qc_sample_data \
|
||||
"$OUT_DIR_SPATIAL"
|
||||
|
||||
# generate json for testing
|
||||
viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
|
||||
--input "$OUT_DIR"/sample_one.qc.cellbender.h5mu \
|
||||
--input "$OUT_DIR"/sample_two.qc.cellbender.h5mu \
|
||||
--ingestion_method cellranger_multi \
|
||||
--obs_metadata "donor_id;cell_type;batch;condition" \
|
||||
--output "$OUT_DIR"/sc_dataset.json \
|
||||
--output_reporting_json "$OUT_DIR"/sc_report_structure.json
|
||||
|
||||
viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
|
||||
--input "$OUT_DIR_SPATIAL"/xenium_tiny.qc.h5mu \
|
||||
--input "$OUT_DIR_SPATIAL"/xenium_tiny.qc.h5mu \
|
||||
--ingestion_method xenium \
|
||||
--min_num_nonzero_vars 1 \
|
||||
--output "$OUT_DIR_SPATIAL"/xenium_dataset.json \
|
||||
--output_reporting_json "$OUT_DIR_SPATIAL"/xenium_report_structure.json
|
||||
|
||||
# remove all state yaml files
|
||||
rm "$OUT_DIR"/*.yaml
|
||||
rm "$OUT_DIR_SPATIAL"/*.yaml
|
||||
|
||||
# copy to s3
|
||||
aws s3 sync \
|
||||
"$OUT_DIR" \
|
||||
s3://openpipelines-bio/openpipeline_incubator/"$OUT_DIR" \
|
||||
--delete \
|
||||
--dryrun
|
||||
|
||||
|
||||
aws s3 sync \
|
||||
"$OUT_DIR_SPATIAL" \
|
||||
s3://openpipelines-bio/openpipeline_incubator/"$OUT_DIR_SPATIAL" \
|
||||
--delete \
|
||||
--dryrun
|
||||
37
resources_test_scripts/spatial_qc_sample_data.sh
Executable file
37
resources_test_scripts/spatial_qc_sample_data.sh
Executable file
@@ -0,0 +1,37 @@
|
||||
#!/bin/bash

# Output directory for the generated spatial QC test resources.
OUT_DIR=resources_test/spatial_qc_sample_data

# mkdir -p succeeds silently when the directory already exists, so no explicit
# existence test is required.
mkdir -p "$OUT_DIR"
|
||||
|
||||
# fetch/create h5mu from somewhere
|
||||
cat > /tmp/qc.yaml <<EOF
|
||||
param_list:
|
||||
- id: xenium_tiny
|
||||
input: s3://openpipelines-bio/openpipeline_spatial/resources_test/xenium/xenium_tiny.h5mu
|
||||
- id: Lung5_Rep2_tiny
|
||||
input: s3://openpipelines-bio/openpipeline_spatial/resources_test/cosmx/Lung5_Rep2_tiny.h5mu
|
||||
var_name_mitochondrial_genes: mitochondrial
|
||||
var_name_ribosomal_genes: ribosomal
|
||||
output: '\$id.qc.h5mu'
|
||||
output_compression: gzip
|
||||
publish_dir: "$OUT_DIR"
|
||||
EOF
|
||||
|
||||
nextflow run openpipelines-bio/openpipeline \
|
||||
-latest \
|
||||
-r 2.1.0 \
|
||||
-main-script target/nextflow/workflows/qc/qc/main.nf \
|
||||
-profile docker \
|
||||
-params-file /tmp/qc.yaml \
|
||||
-resume \
|
||||
-config src/configs/labels_ci.config
|
||||
|
||||
# copy to s3
|
||||
aws s3 sync \
|
||||
--profile di \
|
||||
resources_test/spatial_qc_sample_data \
|
||||
s3://openpipelines-bio/openpipeline_incubator/resources_test/spatial_qc_sample_data \
|
||||
--delete --dryrun \
|
||||
--exclude "*" --include "*.h5mu" \
|
||||
|
||||
11
src/authors/dorien_roosen.yaml
Normal file
11
src/authors/dorien_roosen.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
name: Dorien Roosen
|
||||
info:
|
||||
role: Core Team Member
|
||||
links:
|
||||
email: dorien@data-intuitive.com
|
||||
github: dorien-er
|
||||
linkedin: dorien-roosen
|
||||
organizations:
|
||||
- name: Data Intuitive
|
||||
href: https://www.data-intuitive.com
|
||||
role: Data Scientist
|
||||
11
src/authors/jakub_majercik.yaml
Normal file
11
src/authors/jakub_majercik.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
name: Jakub Majercik
|
||||
info:
|
||||
role: Contributor
|
||||
links:
|
||||
email: jakub@data-intuitive.com
|
||||
github: jakubmajercik
|
||||
linkedin: jakubmajercik
|
||||
organizations:
|
||||
- name: Data Intuitive
|
||||
href: https://www.data-intuitive.com
|
||||
role: Bioinformatics Engineer
|
||||
15
src/authors/robrecht_cannoodt.yaml
Normal file
15
src/authors/robrecht_cannoodt.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
name: Robrecht Cannoodt
|
||||
info:
|
||||
role: Core Team Member
|
||||
links:
|
||||
email: robrecht@data-intuitive.com
|
||||
github: rcannood
|
||||
orcid: "0000-0003-3641-729X"
|
||||
linkedin: robrechtcannoodt
|
||||
organizations:
|
||||
- name: Data Intuitive
|
||||
href: https://www.data-intuitive.com
|
||||
role: Data Science Engineer
|
||||
- name: Open Problems
|
||||
href: https://openproblems.bio
|
||||
role: Core Member
|
||||
6
src/authors/weiwei_schultz.yaml
Normal file
6
src/authors/weiwei_schultz.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
name: Weiwei Schultz
|
||||
info:
|
||||
role: Contributor
|
||||
organizations:
|
||||
- name: Janssen R&D US
|
||||
role: Associate Director Data Sciences
|
||||
2
src/base/requirements/anndata.yaml
Normal file
2
src/base/requirements/anndata.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
packages:
|
||||
- anndata~=0.11.1
|
||||
9
src/base/requirements/anndata_mudata.yaml
Normal file
9
src/base/requirements/anndata_mudata.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
__merge__: [/src/base/requirements/anndata.yaml, .]
|
||||
packages:
|
||||
- mudata~=0.3.1
|
||||
# Make sure that awkward is not installed. Currently, support of awkward arrays
|
||||
# in anndata is experimental, and it is enabled based on whether or not the package
|
||||
# is available. By making sure that awkward is not installed, the functionality is
|
||||
# not enabled.
|
||||
script: |
|
||||
exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")
|
||||
2
src/base/requirements/viashpy.yaml
Normal file
2
src/base/requirements/viashpy.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
packages:
|
||||
- viashpy==0.8.0
|
||||
36
src/configs/integration_tests.config
Normal file
36
src/configs/integration_tests.config
Normal file
@@ -0,0 +1,36 @@
|
||||
profiles {
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
}
|
||||
66
src/configs/labels.config
Normal file
66
src/configs/labels.config
Normal file
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Limit a requested memory amount to `process.maxMemory`.
 *
 * - If `process.maxMemory` is unset or empty, the request is returned unchanged.
 * - If `process.maxRetries` is set and the current `task.attempt` equals it,
 *   `process.maxMemory` is returned regardless of the request.
 * - If the request is larger than `process.maxMemory`, `process.maxMemory` is
 *   returned as a MemoryUnit.
 * - Otherwise the request is returned unchanged.
 *
 * Any exception raised while evaluating the above prints a diagnostic message
 * and terminates the JVM with exit code 1.
 *
 * @param to_compare the requested amount of memory
 * @return the amount of memory to use
 */
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, go straight to the cap.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // Request exceeds the cap: clamp it. This branch previously returned the
    // undefined name `max_memory`, which threw and ended up in the catch
    // block below, aborting the run instead of clamping.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
105
src/configs/labels_ci.config
Normal file
105
src/configs/labels_ci.config
Normal file
@@ -0,0 +1,105 @@
|
||||
process {
|
||||
withLabel: lowmem { memory = 13.Gb }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midmem { memory = 13.Gb }
|
||||
withLabel: midcpu { cpus = 4 }
|
||||
withLabel: highmem { memory = 13.Gb }
|
||||
withLabel: highcpu { cpus = 4 }
|
||||
withLabel: veryhighmem { memory = 13.Gb }
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
}
|
||||
|
||||
env.NUMBA_CACHE_DIR = '/tmp'
|
||||
|
||||
trace {
|
||||
enabled = true
|
||||
overwrite = true
|
||||
}
|
||||
dag {
|
||||
overwrite = true
|
||||
}
|
||||
|
||||
process.maxForks = 1
|
||||
|
||||
profiles {
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docker {
|
||||
docker.fixOwnership = true
|
||||
docker.enabled = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
|
||||
local {
|
||||
// This config is for local processing.
|
||||
process {
|
||||
maxMemory = 25.GB
|
||||
withLabel: verylowcpu { cpus = 2 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 6 }
|
||||
withLabel: highcpu { cpus = 12 }
|
||||
|
||||
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Limit a requested memory amount to `process.maxMemory`.
 *
 * - If `process.maxMemory` is unset or empty, the request is returned unchanged.
 * - If `process.maxRetries` is set and the current `task.attempt` equals it,
 *   `process.maxMemory` is returned regardless of the request.
 * - If the request is larger than `process.maxMemory`, `process.maxMemory` is
 *   returned as a MemoryUnit.
 * - Otherwise the request is returned unchanged.
 *
 * Any exception raised while evaluating the above prints a diagnostic message
 * and terminates the JVM with exit code 1.
 *
 * @param to_compare the requested amount of memory
 * @return the amount of memory to use
 */
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, go straight to the cap.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // Request exceeds the cap: clamp it. This branch previously returned the
    // undefined name `max_memory`, which threw and ended up in the catch
    // block below, aborting the run instead of clamping.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
67
src/ingestion_qc/detect_ingestion_method/config.vsh.yaml
Normal file
67
src/ingestion_qc/detect_ingestion_method/config.vsh.yaml
Normal file
@@ -0,0 +1,67 @@
|
||||
name: detect_ingestion_method
|
||||
namespace: ingestion_qc
|
||||
description: |
|
||||
Detects the ingestion method of a dataset.
|
||||
Currently detects either 10X CellRanger Multi, 10X Xenium or Nanostring CosMx, but can be extended to other technologies upon request.
|
||||
authors:
|
||||
- __merge__: /src/authors/dorien_roosen.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/weiwei_schultz.yaml
|
||||
roles: [contributor]
|
||||
argument_groups:
|
||||
- name: Inputs
|
||||
arguments:
|
||||
- name: --input
|
||||
type: file
|
||||
required: true
|
||||
direction: input
|
||||
description: The input h5mu file(s)
|
||||
example: path/to/file.h5mu
|
||||
- name: --modality
|
||||
type: string
|
||||
description: The modality to use
|
||||
default: rna
|
||||
- name: Outputs
|
||||
arguments:
|
||||
- name: --output_uns_ingestion_method
|
||||
description: The .uns field in which to store the experimental setup. Values stored are `cellranger_multi`, `xenium` or `cosmx`.
|
||||
type: string
|
||||
default: ingestion_method
|
||||
- name: --output
|
||||
type: file
|
||||
required: true
|
||||
direction: output
|
||||
description: The output h5mu file, containing an .uns field with experiment description.
|
||||
example: path/to/file.h5mu
|
||||
resources:
|
||||
- type: python_script
|
||||
path: script.py
|
||||
- path: /src/utils/setup_logger.py
|
||||
test_resources:
|
||||
- type: python_script
|
||||
path: test.py
|
||||
- path: /resources_test/qc_sample_data/sample_one.qc.h5mu
|
||||
- path: /resources_test/spatial_qc_sample_data/Lung5_Rep2_tiny.qc.h5mu
|
||||
- path: /resources_test/spatial_qc_sample_data/xenium_tiny.qc.h5mu
|
||||
engines:
|
||||
- type: docker
|
||||
image: python:3.12-slim
|
||||
setup:
|
||||
- type: apt
|
||||
packages:
|
||||
- procps
|
||||
- type: python
|
||||
__merge__: [/src/base/requirements/anndata_mudata.yaml]
|
||||
test_setup:
|
||||
- type: apt
|
||||
packages:
|
||||
- git
|
||||
- type: python
|
||||
__merge__: [/src/base/requirements/viashpy.yaml]
|
||||
github: openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils
|
||||
|
||||
runners:
|
||||
- type: executable
|
||||
- type: nextflow
|
||||
directives:
|
||||
label: [lowmem, lowdisk]
|
||||
69
src/ingestion_qc/detect_ingestion_method/script.py
Normal file
69
src/ingestion_qc/detect_ingestion_method/script.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import shutil
|
||||
import anndata as ad
|
||||
import h5py
|
||||
import sys
|
||||
|
||||
## VIASH START
|
||||
par = {
|
||||
"input": "resources_test/qc_sample_data/sample_one.qc.h5mu",
|
||||
# "input": "resources_test/spatial_qc_sample_data/xenium/xenium_tiny_qc.h5mu",
|
||||
# "input": "/Users/dorienroosen/code/openpipeline_spatial/resources_test/cosmx/Lung5_Rep2_tiny.h5mu",
|
||||
"output": "output.h5mu",
|
||||
"output_uns_ingestion_method": "ingestion_method",
|
||||
"modality": "rna"
|
||||
}
|
||||
meta = {
|
||||
"resources_dir": "src/utils"
|
||||
}
|
||||
## VIASH END
|
||||
|
||||
sys.path.append(meta["resources_dir"])
|
||||
from setup_logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def main(par):
    """Detect how an h5mu file was ingested and record it in the output's .uns.

    The input file is inspected for markers of each supported ingestion
    method; exactly one method must match. The input is then copied to the
    output path and the detected method is stored in the root-level ``uns``
    group of the copy under ``par["output_uns_ingestion_method"]``.

    Args:
        par: Component parameters. Keys read here: ``input``, ``output``,
            ``modality`` and ``output_uns_ingestion_method``.

    Raises:
        ValueError: If no method or more than one method is detected, or if
            the root ``uns`` already holds a different value under the
            target key.
    """
    uns_key = par["output_uns_ingestion_method"]

    # read h5mu file
    with h5py.File(par["input"], "r") as file:
        mod = file["mod"][par["modality"]]
        uns = ad.experimental.read_elem(file["uns"])
        mod_obs = ad.experimental.read_elem(mod["obs"])
        mod_uns = ad.experimental.read_elem(mod["uns"])

    # detect ingestion method
    ingestion_methods = {
        "cellranger_multi": "metrics_cellranger" in uns,
        "xenium": all(key in mod_obs for key in ["segmentation_method", "nucleus_area"]),
        "cosmx": "spatial" in mod_uns and "fov" in mod_obs,
    }

    # make sure only one ingestion method is detected
    detected_methods = [method for method, detected in ingestion_methods.items() if detected]

    if not detected_methods:
        raise ValueError("No ingestion method detected")
    if len(detected_methods) > 1:
        raise ValueError(f"Multiple ingestion methods detected: {', '.join(detected_methods)}")

    detected_method = detected_methods[0]
    logger.info(f"Detected ingestion method {detected_method}")

    # The result is written to the root-level .uns of the output (below), so a
    # pre-existing value has to be looked up there as well, not in the
    # modality-level .uns.
    if uns.get(uns_key, detected_method) != detected_method:
        raise ValueError(f"Field .uns['{uns_key}'] already exists and contains different value `{uns.get(uns_key)}` than detected method (`{detected_method}`).")

    # copy input to output
    shutil.copy(par["input"], par["output"])

    # Only write when the key is absent: at this point an existing entry is
    # known to hold the same value already.
    if uns_key not in uns:
        with h5py.File(par["output"], "r+") as out_file:
            out_file["uns"][uns_key] = detected_method
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(par)
|
||||
63
src/ingestion_qc/detect_ingestion_method/test.py
Normal file
63
src/ingestion_qc/detect_ingestion_method/test.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import pytest
|
||||
import h5py
|
||||
import os
|
||||
import anndata as ad
|
||||
import sys
|
||||
|
||||
## VIASH START
|
||||
meta = {
|
||||
"resources_dir": "resources_test"
|
||||
}
|
||||
## VIASH END
|
||||
|
||||
|
||||
def test_cellranger(run_component, tmp_path):
    """Run the component on the CellRanger sample and expect `cellranger_multi` in .uns."""
    input_path = meta["resources_dir"] + "/sample_one.qc.h5mu"
    output = tmp_path / "output_cellranger.h5mu"
    cli_args = ["--input", input_path, "--output", output]

    run_component(cli_args)

    assert os.path.exists(output), "Output file was not created"

    # The detected method is read back from the root-level uns group.
    with h5py.File(output, "r") as h5_handle:
        root_uns = ad.experimental.read_elem(h5_handle["uns"])
        assert root_uns["ingestion_method"] == "cellranger_multi", "cellranger_multi not detected"
|
||||
|
||||
|
||||
def test_xenium(run_component, tmp_path):
    """Run the component on the Xenium sample and expect `xenium` in .uns."""
    input_path = meta["resources_dir"] + "/xenium_tiny.qc.h5mu"
    output = tmp_path / "output_xenium.h5mu"
    cli_args = ["--input", input_path, "--output", output]

    run_component(cli_args)

    assert os.path.exists(output), "Output file was not created"

    # The detected method is read back from the root-level uns group.
    with h5py.File(output, "r") as h5_handle:
        root_uns = ad.experimental.read_elem(h5_handle["uns"])
        assert root_uns["ingestion_method"] == "xenium", "xenium not detected"
|
||||
|
||||
|
||||
def test_cosmx(run_component, tmp_path):
    """Run the component on the CosMx sample and expect `cosmx` in .uns."""
    input_path = meta["resources_dir"] + "/Lung5_Rep2_tiny.qc.h5mu"
    output = tmp_path / "output_cosmx.h5mu"
    cli_args = ["--input", input_path, "--output", output]

    run_component(cli_args)

    assert os.path.exists(output), "Output file was not created"

    # The detected method is read back from the root-level uns group.
    with h5py.File(output, "r") as h5_handle:
        root_uns = ad.experimental.read_elem(h5_handle["uns"])
        assert root_uns["ingestion_method"] == "cosmx", "cosmx not detected"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(pytest.main([__file__]))
|
||||
64
src/ingestion_qc/generate_html/config.vsh.yaml
Normal file
64
src/ingestion_qc/generate_html/config.vsh.yaml
Normal file
@@ -0,0 +1,64 @@
|
||||
name: generate_html
|
||||
namespace: ingestion_qc
|
||||
description: Generate an HTML report from the QC metrics
|
||||
authors:
|
||||
- __merge__: /src/authors/jakub_majercik.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/dorien_roosen.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/robrecht_cannoodt.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/weiwei_schultz.yaml
|
||||
roles: [contributor]
|
||||
argument_groups:
|
||||
- name: Inputs
|
||||
arguments:
|
||||
- name: --input_data
|
||||
type: file
|
||||
required: true
|
||||
direction: input
|
||||
description: The input JSON file containing the QC metrics
|
||||
example: path/to/file.json
|
||||
- name: --input_structure
|
||||
type: file
|
||||
required: true
|
||||
direction: input
|
||||
description: The input JSON file containing the structure of the data
|
||||
example: path/to/file.json
|
||||
- name: Outputs
|
||||
arguments:
|
||||
- name: --output_qc_report
|
||||
type: file
|
||||
required: true
|
||||
direction: output
|
||||
description: The output HTML report
|
||||
example: path/to/file.html
|
||||
resources:
|
||||
- type: bash_script
|
||||
path: script.sh
|
||||
test_resources:
|
||||
- type: bash_script
|
||||
path: test.sh
|
||||
- path: /resources_test/qc_sample_data/sc_dataset.json
|
||||
- path: /resources_test/qc_sample_data/sc_report_structure.json
|
||||
- path: /resources_test/spatial_qc_sample_data/xenium_dataset.json
|
||||
- path: /resources_test/spatial_qc_sample_data/xenium_report_structure.json
|
||||
engines:
|
||||
- type: docker
|
||||
image: node:latest
|
||||
setup:
|
||||
- type: apt
|
||||
packages:
|
||||
- git
|
||||
- type: docker
|
||||
run: |
|
||||
npm install -g pnpm@latest-10 \
|
||||
&& cd /opt && git clone -b v0.1.0 https://github.com/openpipelines-bio/siqc.git \
|
||||
&& cd siqc && pnpm install \
|
||||
&& true
|
||||
|
||||
runners:
|
||||
- type: executable
|
||||
- type: nextflow
|
||||
directives:
|
||||
label: [lowmem, lowdisk]
|
||||
18
src/ingestion_qc/generate_html/script.sh
Executable file
18
src/ingestion_qc/generate_html/script.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
# Builds the HTML QC report from the two input JSON files using the siqc
# project checked out at /opt/siqc, then copies the result to the output path.

# Stop at the first failing step so that a failed compression step cannot be
# hidden by a later step that happens to succeed.
set -eo pipefail

# Resolve all paths before changing directory below; quoting keeps paths that
# contain spaces intact.
ABSOLUTE_INPUT_DATA=$(realpath "$par_input_data")
ABSOLUTE_INPUT_STRUCTURE=$(realpath "$par_input_structure")
ABSOLUTE_OUTPUT=$(realpath "$par_output_qc_report")

cd /opt/siqc
# -p: do not fail when the directory is already present.
mkdir -p src/data

echo "Compressing input data..."
pnpm run compress_data "$ABSOLUTE_INPUT_DATA" "src/data/dataset.ts"

echo "Compressing report structure..."
pnpm run compress_data "$ABSOLUTE_INPUT_STRUCTURE" "src/data/report_structure.ts"

echo "Generating HTML..."
pnpm run build

echo "Copying HTML to output directory..."
cp dist/index.html "$ABSOLUTE_OUTPUT"
|
||||
10
src/ingestion_qc/generate_html/test.sh
Normal file
10
src/ingestion_qc/generate_html/test.sh
Normal file
@@ -0,0 +1,10 @@
|
||||
# Smoke test: run the component on the single-cell sample JSON files and
# verify that an HTML report is produced.
echo ">> Generating report"
"$meta_executable" \
  --input_data "$meta_resources_dir/sc_dataset.json" \
  --input_structure "$meta_resources_dir/sc_report_structure.json" \
  --output_qc_report "index.html"

echo ">> Checking output"
if [ ! -f "index.html" ]; then
  echo "Error: Output report does not exist." && exit 1
fi

echo ">> Test succesful" && exit 0
|
||||
168
src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml
Normal file
168
src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml
Normal file
@@ -0,0 +1,168 @@
|
||||
name: h5mu_to_qc_json
|
||||
namespace: ingestion_qc
|
||||
scope: private
|
||||
description: |
|
||||
Takes H5MU files that have been ingested by CellRanger, Xenium or CosMx and processed by the QC workflow, and generates:
|
||||
- A JSON file that contains the combined data for the QC report
|
||||
- A JSON file that defines the layout and structure of the QC report
|
||||
authors:
|
||||
- __merge__: /src/authors/jakub_majercik.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/dorien_roosen.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/robrecht_cannoodt.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/weiwei_schultz.yaml
|
||||
roles: [contributor]
|
||||
argument_groups:
|
||||
- name: Inputs
|
||||
arguments:
|
||||
- name: --input
|
||||
type: file
|
||||
multiple: true
|
||||
required: true
|
||||
direction: input
|
||||
description: The input h5mu file(s)
|
||||
example: path/to/file.h5mu
|
||||
- name: --modality
|
||||
type: string
|
||||
description: The modality to use
|
||||
default: rna
|
||||
- name: --ingestion_method
|
||||
type: string
|
||||
required: true
|
||||
choices:
|
||||
- cellranger_multi
|
||||
- xenium
|
||||
description: Method that was used to ingest the data - this will define the structure of the report that is generated.
|
||||
- name: --obs_sample_id
|
||||
type: string
|
||||
description: The key in the h5mu file that contains the sample ID. If not provided, each H5MU file will be considered as a separate sample.
|
||||
default: sample_id
|
||||
- name: --obs_total_counts
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the total counts.
|
||||
default: total_counts
|
||||
- name: --obs_num_nonzero_vars
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the number of nonzero vars.
|
||||
default: num_nonzero_vars
|
||||
- name: --obs_fraction_mitochondrial
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the fraction mitochondrial genes.
|
||||
default: fraction_mitochondrial
|
||||
- name: --obs_fraction_ribosomal
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the fraction ribosomal genes.
|
||||
default: fraction_ribosomal
|
||||
|
||||
- name: Outputs
|
||||
arguments:
|
||||
- name: --output
|
||||
type: file
|
||||
required: true
|
||||
direction: output
|
||||
description: The output JSON file
|
||||
example: path/to/file.json
|
||||
- name: --output_reporting_json
|
||||
type: file
|
||||
required: true
|
||||
description: The output JSON file that defines the QC report
|
||||
direction: output
|
||||
example: path/to/file.json
|
||||
|
||||
- name: Filtering & grouping options
|
||||
arguments:
|
||||
- name: --min_total_counts
|
||||
type: integer
|
||||
description: Minimum total counts for a cell to be included in the output
|
||||
default: 10
|
||||
- name: --min_num_nonzero_vars
|
||||
type: integer
|
||||
description: Minimum number of nonzero vars for a cell to be included in the output
|
||||
default: 10
|
||||
- name: --obs_metadata
|
||||
type: string
|
||||
multiple: true
|
||||
description: The metadata keys in the h5mu .obs to include in the output JSON.
|
||||
example: "donor_id;cell_type;batch;condition"
|
||||
|
||||
- name: Options for CellRanger reports
|
||||
arguments:
|
||||
- name: --obs_cellbender
|
||||
type: string
|
||||
multiple: true
|
||||
description: The cellbender keys in the h5mu .obs to include in the output JSON
|
||||
default: [
|
||||
"cellbender_background_fraction",
|
||||
"cellbender_cell_probability",
|
||||
"cellbender_cell_size",
|
||||
"cellbender_droplet_efficiency"
|
||||
]
|
||||
- name: --uns_cellranger_metrics
|
||||
type: string
|
||||
description: The key in the h5mu file .uns that contains the cellranger metrics
|
||||
default: metrics_cellranger
|
||||
|
||||
|
||||
- name: Options for Xenium reports
|
||||
arguments:
|
||||
- name: --obs_nucleus_area
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the nucleus area.
|
||||
default: nucleus_area
|
||||
- name: --obs_cell_area
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the cell area.
|
||||
default: cell_area
|
||||
- name: --obs_x_coord
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the x coordinate.
|
||||
default: x_coord
|
||||
- name: --obs_y_coord
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the y coordinate.
|
||||
default: y_coord
|
||||
- name: --obs_control_probe_counts
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the number of control probes.
|
||||
default: control_probe_counts
|
||||
- name: --obs_control_codeword_counts
|
||||
type: string
|
||||
description: The key in the h5mu .obs field that contains the number of control codewords.
|
||||
default: control_codeword_counts
|
||||
|
||||
# - name: Options for CosMx reports
|
||||
|
||||
resources:
|
||||
- type: python_script
|
||||
path: script.py
|
||||
- path: /src/utils/setup_logger.py
|
||||
- path: report_structure
|
||||
test_resources:
|
||||
- type: python_script
|
||||
path: test.py
|
||||
- type: file
|
||||
path: /resources_test
|
||||
engines:
|
||||
- type: docker
|
||||
image: python:3.12-slim
|
||||
setup:
|
||||
- type: apt
|
||||
packages:
|
||||
- procps
|
||||
- type: python
|
||||
__merge__: [ /src/base/requirements/anndata_mudata.yaml ]
|
||||
test_setup:
|
||||
- type: apt
|
||||
packages:
|
||||
- git
|
||||
- type: python
|
||||
__merge__: [/src/base/requirements/viashpy.yaml]
|
||||
github: openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils
|
||||
|
||||
runners:
|
||||
- type: executable
|
||||
- type: nextflow
|
||||
directives:
|
||||
label: [midmem, middisk]
|
||||
@@ -0,0 +1,162 @@
|
||||
{
|
||||
"categories": [
|
||||
{
|
||||
"name": "Sample QC",
|
||||
"key": "sample_summary_stats",
|
||||
"additionalAxes": false,
|
||||
"defaultFilters": []
|
||||
},
|
||||
{
|
||||
"name": "SampleQC",
|
||||
"key": "metrics_cellranger_stats",
|
||||
"additionalAxes": false,
|
||||
"defaultFilters": [
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Number_of_reads_in_the_library",
|
||||
"label": "Number of reads per library",
|
||||
"description": "Sequencing depth per sample. Higher values generally indicate more comprehensive cell profiling.",
|
||||
"nBins": 10,
|
||||
"groupBy": "sample_id",
|
||||
"xAxisType": "linear",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Confidently_mapped_reads_in_cells",
|
||||
"label": "Confidently mapped reads in cells",
|
||||
"description": "Number of reads that were mapped unambiguously to the reference genome within cell-containing droplets.",
|
||||
"groupBy": "sample_id",
|
||||
"nBins": 10,
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Estimated_number_of_cells",
|
||||
"label": "Estimated number of cells",
|
||||
"description": "CellRanger's estimate of the number of cells per sample based on the UMI count distribution.",
|
||||
"groupBy": "sample_id",
|
||||
"nBins": 10,
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Sequencing_saturation",
|
||||
"label": "Sequencing saturation",
|
||||
"description": "Fraction of reads that are duplicates of existing UMIs. Higher values suggest deeper sequencing coverage.",
|
||||
"groupBy": "sample_id",
|
||||
"nBins": 10,
|
||||
"yAxisType": "linear"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Cell RNA QC",
|
||||
"key": "cell_rna_stats",
|
||||
"additionalAxes": true,
|
||||
"defaultFilters": [
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "total_counts",
|
||||
"label": "Total UMI per cell",
|
||||
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "num_nonzero_vars",
|
||||
"label": "Number of non-zero genes per cell",
|
||||
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "fraction_mitochondrial",
|
||||
"label": "Fraction UMI of mitochondrial genes per cell",
|
||||
"description": "Proportion of cell's RNA from mitochondrial genes.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "fraction_ribosomal",
|
||||
"label": "Fraction UMI of ribosomal genes per cell",
|
||||
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "pct_of_counts_in_top_50_vars",
|
||||
"label": "Fraction UMI in top 50 genes per cell",
|
||||
"description": "Proportion of RNA molecules from the 50 most-expressed genes in each cell.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_cell_probability",
|
||||
"label": "CellBender cell probability",
|
||||
"description": "CellBender's statistical confidence (0-1) that a barcode represents a real cell, with higher values indicating stronger confidence.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_background_fraction",
|
||||
"label": "CellBender background fraction",
|
||||
"description": "Estimated percentage of each cell's RNA that comes from the ambient solution rather than the cell itself.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_cell_size",
|
||||
"label": "CellBender cell size",
|
||||
"description": "CellBender's estimate of the true number of RNA molecules in each cell after removing ambient contamination. Reflects actual cell RNA content rather than raw UMI counts.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_droplet_efficiency",
|
||||
"label": "CellBender droplet efficiency",
|
||||
"description": "CellBender's estimate of how efficiently each droplet captured RNA molecules. Higher values indicate more reliable RNA sampling within individual droplets.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
{
|
||||
"categories": [
|
||||
{
|
||||
"name": "Sample QC",
|
||||
"key": "sample_summary_stats",
|
||||
"additionalAxes": false,
|
||||
"defaultFilters": []
|
||||
},
|
||||
{
|
||||
"name": "Cell RNA QC",
|
||||
"key": "cell_rna_stats",
|
||||
"additionalAxes": true,
|
||||
"defaultFilters": [
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "total_counts",
|
||||
"label": "Total UMI per cell",
|
||||
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "num_nonzero_vars",
|
||||
"label": "Number of non-zero genes per cell",
|
||||
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "fraction_mitochondrial",
|
||||
"label": "Fraction UMI of mitochondrial genes per cell",
|
||||
"description": "Proportion of cell's RNA from mitochondrial genes.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "fraction_ribosomal",
|
||||
"label": "Fraction UMI of ribosomal genes per cell",
|
||||
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "cell_area",
|
||||
"label": "Segmented cell area",
|
||||
"description": "Area of the segmented cells.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "nucleus_ratio",
|
||||
"label": "Nucleus Ratio",
|
||||
"description": "Ratio of the nucleus area to the segmented cell area.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
385
src/ingestion_qc/h5mu_to_qc_json/script.py
Normal file
385
src/ingestion_qc/h5mu_to_qc_json/script.py
Normal file
@@ -0,0 +1,385 @@
|
||||
import json
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
import anndata as ad
|
||||
import h5py
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
|
||||
## VIASH START
# Placeholder values for running this script directly during development.
# NOTE(review): the region between the VIASH markers is presumably replaced by
# viash with the real arguments when the component is built - confirm.
inputs = list(Path("resources_test_after_running_script/qc_sample_data").glob("*.qc.h5mu"))
par = {
    "input": sorted([str(x) for x in inputs]),
    # "input": ["resources_test/spatial_qc_sample_data/xenium_tiny.qc.h5mu", "resources_test/spatial_qc_sample_data/xenium_tiny.qc.h5mu"],
    "output": "sc_data.json",
    "output_reporting_json": "sc_report_structure.json",
    "modality": "rna",
    "ingestion_method": "cellranger_multi",
    "obs_sample_id": "sample_id",
    "obs_total_counts": "total_counts",
    "obs_num_nonzero_vars": "num_nonzero_vars",
    "obs_fraction_mitochondrial": "fraction_mitochondrial",
    "obs_fraction_ribosomal": "fraction_ribosomal",
    "min_total_counts": 20,
    "min_num_nonzero_vars": 20,
    "obs_cellbender": [
        "cellbender_background_fraction",
        "cellbender_cell_probability",
        "cellbender_cell_size",
        "cellbender_droplet_efficiency",
    ],
    "uns_cellranger_metrics": "metrics_cellranger",
    "obs_metadata": ["cell_type"],
    "obs_nucleus_area": "nucleus_area",
    "obs_cell_area": "cell_area",
    "obs_x_coord": "x_coord",
    "obs_y_coord": "y_coord",
    "obs_control_probe_counts": "control_probe_counts",
    "obs_control_codeword_counts": "control_codeword_counts"
}
meta = {
    "resources_dir": os.path.abspath("src/ingestion_qc/h5mu_to_qc_json"),
}
# Convenience bindings for stepping through the body of main() interactively.
i = 0
mudata_file = par["input"][i]

sys.path.append("src/utils")
## VIASH END

# setup_logger.py is shipped as a component resource, so it is importable only
# after the resources directory is on the module search path.
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger

logger = setup_logger()

# Normalise an unset cellbender key list to an empty list so later code can
# iterate over it unconditionally.
par["obs_cellbender"] = par["obs_cellbender"] or []
|
||||
|
||||
|
||||
def transform_df(df):
    """Transform a DataFrame into the annotation object format.

    Args:
        df: DataFrame whose columns are all integer, float or categorical.

    Returns:
        A dict with the row/column counts, the filter thresholds taken from
        ``par`` and a ``columns`` list. Each column entry holds ``name``,
        ``dtype`` ("integer", "numeric" or "categorical") and ``data``;
        categorical entries store the integer codes in ``data`` and the labels
        in an extra ``categories`` list.

    Raises:
        ValueError: if a column has any other dtype.
    """
    columns = []
    for name in df.columns:
        data = df[name]

        # Determine dtype
        if pd.api.types.is_integer_dtype(data):
            dtype = "integer"
        elif pd.api.types.is_float_dtype(data):
            dtype = "numeric"
        # isinstance check instead of the deprecated
        # pd.api.types.is_categorical_dtype helper
        elif isinstance(data.dtype, pd.CategoricalDtype):
            dtype = "categorical"
        else:
            raise ValueError(f"Unknown/unsupported data type for column {name}")

        column_info = {"name": name, "dtype": dtype}

        if dtype == "categorical":
            column_info["data"] = data.cat.codes.tolist()
            column_info["categories"] = data.cat.categories.tolist()
        else:
            # Missing values become None so they serialise as JSON null
            column_info["data"] = [None if pd.isna(x) else x for x in data]

        columns.append(column_info)

    return {
        "num_rows": len(df),
        "num_cols": len(df.columns),
        "min_total_counts": par["min_total_counts"],
        "min_num_nonzero_vars": par["min_num_nonzero_vars"],
        "columns": columns,
    }
|
||||
|
||||
|
||||
def check_optional_obs_keys(obs, keys, message):
    """Log which of the optional ``keys`` are absent from ``obs.columns``.

    Nothing is raised and nothing is returned; when at least one key is
    missing, a single info message listing the missing keys followed by
    ``message`` is written to the module logger.
    """
    absent = []
    for key in keys:
        if key not in obs.columns:
            absent.append(key)

    if not absent:
        return

    logger.info(f"Missing keys in obs: {', '.join(absent)}. {message}")
|
||||
|
||||
|
||||
def transform_cellranger_metrics(uns, sample_id):
    """Reshape the CellRanger metrics table of one sample into a single row.

    Args:
        uns: mapping that holds the metrics table under
            ``par["uns_cellranger_metrics"]``; the table must provide the
            columns "Metric Name" and "Metric Value".
        sample_id: list of sample ids; only the first entry is used to label
            the resulting row.

    Returns:
        A DataFrame with one numeric column per metric (spaces in metric names
        replaced by underscores) plus a ``sample_id`` column.

    Raises:
        ValueError: if the metrics table is not present in ``uns``.
    """
    metrics_key = par["uns_cellranger_metrics"]
    if metrics_key not in uns:
        raise ValueError(f"Could not find cellranger metrics in uns: {metrics_key}. Provide correct value for --uns_cellranger_metrics or make sure data was ingested using CellRanger multi.")

    # One column per metric name; for repeated names the first value is kept
    cellranger_metrics = (
        uns[metrics_key]
        .pivot_table(
            index=[],
            columns="Metric Name",
            values="Metric Value",
            aggfunc="first",
        )
        .reset_index(drop=True)
    )

    cellranger_metrics.columns.name = None
    # Remove thousands separator and convert to numeric
    # NOTE(review): strings that are still not numeric after removing commas
    # are coerced to NaN - confirm the stored metric values carry no unit or
    # percent suffix.
    cellranger_metrics = cellranger_metrics.map(
        lambda x: (
            pd.to_numeric(x.replace(",", ""), errors="coerce")
            if isinstance(x, str)
            else x
        )
    )
    # Replace spaces with underscores in column names
    cellranger_metrics.columns = cellranger_metrics.columns.str.replace(" ", "_")
    # Second pass gives every column a numeric dtype
    for col in cellranger_metrics.columns:
        cellranger_metrics[col] = pd.to_numeric(cellranger_metrics[col], errors="coerce")
    cellranger_metrics["sample_id"] = [sample_id[0]]

    return cellranger_metrics
|
||||
|
||||
|
||||
def format_cellbender_columns(mod_obs):
    """Select the requested cellbender columns and make sure they are floats.

    Args:
        mod_obs: obs DataFrame of the modality; converted columns are written
            back into it.

    Returns:
        Tuple ``(cellbender_obs_keys, mod_obs)`` where the first element lists
        the keys from ``par["obs_cellbender"]`` that are present in
        ``mod_obs``.

    Raises:
        ValueError: if a present cellbender column cannot be cast to float.
    """
    # Check if cellbender was run on the dataset
    if par["obs_cellbender"]:
        check_optional_obs_keys(mod_obs, par["obs_cellbender"], "Run cellbender first to include these metrics.")

    cellbender_obs_keys = [column for column in par["obs_cellbender"] if column in mod_obs]

    for key in cellbender_obs_keys:
        if not pd.api.types.is_float_dtype(mod_obs[key]):
            try:
                mod_obs[key] = mod_obs[key].astype("float16")
            except (ValueError, TypeError) as err:
                # Keep the original failure attached so the offending value
                # stays visible in the traceback
                raise ValueError(f"Could not convert column {key} to a float dtype. Please make sure all cellbender metrics are numeric.") from err

    return cellbender_obs_keys, mod_obs
|
||||
|
||||
|
||||
def format_required_columns(required_keys, mod_obs):
    """Validate and normalise the dtypes of the required QC columns.

    Every column in ``required_keys`` must be numeric, otherwise a ValueError
    is raised. Afterwards the two count columns are cast to integer and the
    two fraction columns to float16 when they do not already have an
    integer/float dtype. The conversions are written into ``mod_obs``, which
    is also returned.
    """
    offending = next(
        (key for key in required_keys if not pd.api.types.is_numeric_dtype(mod_obs[key])),
        None,
    )
    if offending is not None:
        raise ValueError(f"Column {offending} must be a numeric dtype.")

    # (column, check for the wanted dtype family, cast target, label used in the log)
    conversions = (
        (par["obs_total_counts"], pd.api.types.is_integer_dtype, int, "integer"),
        (par["obs_num_nonzero_vars"], pd.api.types.is_integer_dtype, int, "integer"),
        (par["obs_fraction_mitochondrial"], pd.api.types.is_float_dtype, "float16", "float"),
        (par["obs_fraction_ribosomal"], pd.api.types.is_float_dtype, "float16", "float"),
    )
    for column, has_wanted_dtype, target, label in conversions:
        if has_wanted_dtype(mod_obs[column]):
            continue
        logger.info(f"Converting {column} from {mod_obs[column].dtype} to {label} dtype...")
        mod_obs[column] = mod_obs[column].astype(target)

    return mod_obs
|
||||
|
||||
|
||||
def format_categorical_columns(mod_obs):
    """Pick the metadata columns used for grouping and make them categorical.

    When ``par["obs_metadata"]`` is empty, every object/category column except
    the sample id column is used; otherwise the requested keys that exist in
    ``mod_obs`` are used and missing ones are only logged.

    Args:
        mod_obs: obs DataFrame of the modality; converted columns are written
            back into it.

    Returns:
        Tuple ``(metadata_obs_keys, mod_obs)``.
    """
    # Fetch all categorical columns for grouping if no columns are provided
    if not par["obs_metadata"]:
        metadata_obs_keys = mod_obs.select_dtypes(include=["object", "category"]).columns.tolist()
        if par["obs_sample_id"] in metadata_obs_keys:
            metadata_obs_keys.remove(par["obs_sample_id"])
    else:
        check_optional_obs_keys(mod_obs, par["obs_metadata"], "Make sure requested metadata colmuns are present in obs.")
        metadata_obs_keys = [key for key in par["obs_metadata"] if key in mod_obs]

    for key in metadata_obs_keys:
        # Inspect the dtype of the column; the key itself is only a string
        # label and can never be a CategoricalDtype
        if not isinstance(mod_obs[key].dtype, pd.CategoricalDtype):
            logger.info(f"{key} is not a categorical dtype. Converting {key} from {mod_obs[key].dtype} to categorical dtype...")
            mod_obs[key] = mod_obs[key].astype(str).astype("category")

    return metadata_obs_keys, mod_obs
|
||||
|
||||
|
||||
def generate_cellranger_stats(mod_obs, uns, sample_id, required_keys):
    """Assemble the per-cell table and the metrics row for a CellRanger sample.

    Returns a tuple ``(cell_rna_stats, cellranger_stats)``: the first holds
    ``sample_id`` followed by the required, cellbender and metadata columns,
    the second is the output of ``transform_cellranger_metrics``.
    """
    # Normalise dtypes: required columns, then grouping columns, then cellbender
    mod_obs = format_required_columns(required_keys, mod_obs)
    metadata_obs_keys, mod_obs = format_categorical_columns(mod_obs)
    cellbender_obs_keys, mod_obs = format_cellbender_columns(mod_obs)

    # Column order: sample id, required, cellbender, metadata
    table = {"sample_id": pd.Categorical(sample_id)}
    for key in (*required_keys, *cellbender_obs_keys, *metadata_obs_keys):
        table[key] = mod_obs[key]
    cell_rna_stats = pd.DataFrame(table)

    cellranger_stats = transform_cellranger_metrics(uns, sample_id)
    return cell_rna_stats, cellranger_stats
|
||||
|
||||
|
||||
def format_xenium_columns(mod_obs):
    """Add the nucleus/cell area ratio and cast the Xenium columns to float16.

    Returns a tuple ``(mod_obs, xenium_formatted_columns)`` where the second
    element names the cell area column, ``nucleus_ratio``, ``x_coord`` and
    ``y_coord``.
    """
    nucleus_area = mod_obs[par["obs_nucleus_area"]]
    cell_area = mod_obs[par["obs_cell_area"]]
    mod_obs["nucleus_ratio"] = nucleus_area / cell_area

    xenium_formatted_columns = [par["obs_cell_area"], "nucleus_ratio", "x_coord", "y_coord"]
    for column in xenium_formatted_columns:
        as_half = mod_obs[column].astype("float16")
        mod_obs[column] = as_half

    return mod_obs, xenium_formatted_columns
|
||||
|
||||
|
||||
def generate_xenium_stats(mod_obs, sample_id, required_keys):
    """Assemble the per-cell table for a Xenium sample.

    The returned DataFrame holds ``sample_id`` followed by the required
    columns, the Xenium-specific columns and the metadata columns.
    """
    # Normalise dtypes: required columns, Xenium columns, then grouping columns
    mod_obs = format_required_columns(required_keys, mod_obs)
    mod_obs, xenium_formatted_columns = format_xenium_columns(mod_obs)
    metadata_obs_keys, mod_obs = format_categorical_columns(mod_obs)

    # Column order: sample id, required, xenium, metadata
    table = {"sample_id": pd.Categorical(sample_id)}
    for key in (*required_keys, *xenium_formatted_columns, *metadata_obs_keys):
        table[key] = mod_obs[key]

    return pd.DataFrame(table)
|
||||
|
||||
|
||||
def concatenate_dataframes(dfs):
    """Concatenate a list of dataframes, preserving categorical columns.

    A column that is categorical in at least one input frame is categorical in
    the result. Its categories are the union of the input categories, followed
    by any value that only occurs in frames where the column was not
    categorical, so no value is turned into a missing entry.

    Args:
        dfs: list of DataFrames to stack row-wise.

    Returns:
        The concatenated DataFrame with a fresh integer index.
    """
    combined = pd.concat(dfs, ignore_index=True)

    # Find categorical columns that became object columns
    for col in combined.columns:
        # Get all categorical series for this column
        cat_series = [
            frame[col]
            for frame in dfs
            if col in frame.columns and isinstance(frame[col].dtype, pd.CategoricalDtype)
        ]
        if not cat_series:
            continue

        # Union the categories and apply to result
        categories = pd.api.types.union_categoricals(cat_series).categories

        # Values contributed by frames where the column was not categorical
        known = set(categories)
        extra = [
            value
            for value in pd.unique(combined[col].dropna().astype(object))
            if value not in known
        ]
        if extra:
            categories = categories.append(pd.Index(extra))

        combined[col] = pd.Categorical(combined[col], categories=categories)

    return combined
|
||||
|
||||
|
||||
def main(par):
    """Build the QC data JSON and copy the matching report-structure JSON.

    For every file in ``par["input"]`` the ``obs``/``obsm`` tables of modality
    ``par["modality"]`` and the global ``uns`` are read, cells below the
    ``min_total_counts`` / ``min_num_nonzero_vars`` thresholds are dropped, and
    a per-sample summary plus a per-cell table are collected. The combined
    tables are written to ``par["output"]``; the report layout that belongs to
    ``par["ingestion_method"]`` is copied to ``par["output_reporting_json"]``.

    Raises:
        ValueError: if the ingestion method is not supported, if a required
            obs column is missing, or if no cell of a file passes the filters.
    """
    report_structures = {
        "cellranger_multi": os.path.join(meta["resources_dir"], "report_structure/cellranger.json"),
        "xenium": os.path.join(meta["resources_dir"], "report_structure/xenium.json")
    }
    ingestion_method = par["ingestion_method"]
    if ingestion_method not in report_structures:
        raise ValueError(f"Unsupported ingestion method: {ingestion_method}")
    is_cellranger = ingestion_method == "cellranger_multi"
    is_xenium = ingestion_method == "xenium"

    total_counts_key = par["obs_total_counts"]
    nonzero_vars_key = par["obs_num_nonzero_vars"]
    required_keys = [
        total_counts_key,
        nonzero_vars_key,
        par["obs_fraction_mitochondrial"],
        par["obs_fraction_ribosomal"]
    ]

    cell_stats_dfs = []
    sample_stats_dfs = []
    metrics_cellranger_dfs = []

    for i, mudata_file in enumerate(par["input"]):
        logger.info(f"Processing {mudata_file}")

        # Read the necessary info; the context manager closes the h5mu file
        # even when one of the reads fails
        with h5py.File(mudata_file, "r") as file:
            grp_mod = file["mod"][par["modality"]]
            mod_obs = ad.experimental.read_elem(grp_mod["obs"])
            mod_obsm = ad.experimental.read_elem(grp_mod["obsm"])
            uns = ad.experimental.read_elem(file["uns"])

        barcodes_original_count = mod_obs.shape[0]

        # Validate before the columns are used for filtering, so a wrong key
        # is reported with this message instead of a bare KeyError
        missing_keys = [key for key in required_keys if key not in mod_obs.columns]
        if missing_keys:
            raise ValueError(f"Missing keys in obs: {', '.join(missing_keys)}")

        # Add coordinates to obs before filtering
        if is_xenium:
            mod_obs["x_coord"] = mod_obsm["spatial"][:, 0]
            mod_obs["y_coord"] = mod_obsm["spatial"][:, 1]

        # Pre-filter cells, using the configured column names
        logger.info("Pre-filtering cells based on counts...")
        if par.get("min_total_counts") is not None:
            mod_obs = mod_obs[mod_obs[total_counts_key] >= par["min_total_counts"]]
        if par.get("min_num_nonzero_vars") is not None:
            mod_obs = mod_obs[mod_obs[nonzero_vars_key] >= par["min_num_nonzero_vars"]]
        # Own the filtered rows: later steps assign columns on this frame
        mod_obs = mod_obs.copy()
        barcodes_filtered_count = mod_obs.shape[0]
        if barcodes_filtered_count == 0:
            raise ValueError(f"No cells left in {mudata_file} after filtering on minimum counts.")

        # Detect sample id's
        logger.info("Detecting sample id's...")
        sample_id = (
            mod_obs[par["obs_sample_id"]].tolist()
            if par["obs_sample_id"] in mod_obs.columns
            else [f"sample_{i}"] * mod_obs.shape[0]
        )

        # Generating sample summary statistics
        logger.info("Generating sample summary statistics...")
        sample_summary = {
            "sample_id": pd.Categorical([sample_id[0]]),
            "rna_num_barcodes": [barcodes_original_count],
            "rna_num_barcodes_filtered": [barcodes_filtered_count],
            "rna_sum_total_counts": [mod_obs[total_counts_key].sum()],
            "rna_median_total_counts": [mod_obs[total_counts_key].median()],
            "rna_overall_num_nonzero_vars": [mod_obs[nonzero_vars_key].sum()],
            "rna_median_num_nonzero_vars": [mod_obs[nonzero_vars_key].median()],
        }

        if is_xenium:
            total_counts_sum = mod_obs[total_counts_key].sum()
            sample_summary["control_probe_percentage"] = mod_obs[par["obs_control_probe_counts"]].sum() / total_counts_sum * 100
            sample_summary["negative_decoding_percentage"] = mod_obs[par["obs_control_codeword_counts"]].sum() / total_counts_sum * 100

        sample_summary_stats = pd.DataFrame(sample_summary)

        if is_cellranger:
            cell_rna_stats, cellranger_stats = generate_cellranger_stats(mod_obs, uns, sample_id, required_keys)
            metrics_cellranger_dfs.append(cellranger_stats)
        else:
            cell_rna_stats = generate_xenium_stats(mod_obs, sample_id, required_keys)

        cell_stats_dfs.append(cell_rna_stats)
        sample_stats_dfs.append(sample_summary_stats)

    # Combine dataframes of all samples
    logger.info("Combining data of all samples into single object...")
    combined_cell_stats = concatenate_dataframes(cell_stats_dfs)
    combined_sample_stats = concatenate_dataframes(sample_stats_dfs)

    report_categories = [combined_cell_stats, combined_sample_stats]
    if is_cellranger:
        combined_metrics_cellranger = concatenate_dataframes(metrics_cellranger_dfs)
        report_categories.append(combined_metrics_cellranger)

    for df in report_categories:
        df["sample_id"] = pd.Categorical(df["sample_id"])

    output = {
        "cell_rna_stats": transform_df(combined_cell_stats),
        "sample_summary_stats": transform_df(combined_sample_stats)
    }
    if is_cellranger:
        output["metrics_cellranger_stats"] = transform_df(combined_metrics_cellranger)

    logger.info(f"Writing output data json to {par['output']}")
    output_path = Path(par["output"])
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)

    logger.info(f"Writing output report structure json to {par['output_reporting_json']}")
    shutil.copy(report_structures[ingestion_method], par["output_reporting_json"])


if __name__ == "__main__":
    main(par)
|
||||
144
src/ingestion_qc/h5mu_to_qc_json/test.py
Normal file
144
src/ingestion_qc/h5mu_to_qc_json/test.py
Normal file
@@ -0,0 +1,144 @@
|
||||
import pytest
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
import numpy as np
|
||||
|
||||
## VIASH START
# Placeholder values for running the tests from the repository root.
# The tests append "/resources_test/..." to resources_dir themselves, so the
# directory given here must be the one that contains resources_test.
meta = {
    "resources_dir": ".",
    "executable": "./target/executable/ingestion_qc/h5mu_to_qc_json/h5mu_to_qc_json"
}
## VIASH END
|
||||
|
||||
|
||||
def test_cellranger_execution(run_component, tmp_path):
    """Run the component with default options on two CellRanger samples.

    Checks that both outputs are written, that the data JSON holds the three
    expected tables, that the per-cell table contains the expected columns and
    that every table follows the annotation object layout.
    """
    output_json_path = tmp_path / "output.json"
    output_reporting_json_path = tmp_path / "output_reporting.json"

    run_component(
        [
            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
            "--ingestion_method", "cellranger_multi",
            "--output", output_json_path,
            "--output_reporting_json", output_reporting_json_path
        ]
    )

    assert os.path.exists(output_json_path), "Output file was not created"
    assert os.path.exists(output_reporting_json_path), "Output report structure file was not created"

    with open(output_json_path, "r") as f:
        output_json_dict = json.load(f)

    assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}

    column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
    expected_column_names = [
        "sample_id", "total_counts", "num_nonzero_vars",
        "fraction_mitochondrial", "fraction_ribosomal",
        "cellbender_background_fraction", "cellbender_cell_probability",
        "cellbender_cell_size", "cellbender_droplet_efficiency",
        "donor_id", "cell_type", "batch", "condition"
    ]

    # Name the missing columns so a failure is self-explanatory
    missing_columns = [column for column in expected_column_names if column not in column_names_cell]
    assert not missing_columns, f"Missing columns in cell_rna_stats: {missing_columns}"

    for key in output_json_dict.keys():
        assert output_json_dict[key].keys() == {"num_rows", "num_cols", "min_total_counts", "min_num_nonzero_vars", "columns"}
        for col in output_json_dict[key]["columns"]:
            assert {"name", "dtype", "data"}.issubset(col.keys())
|
||||
|
||||
|
||||
def test_set_filters(run_component, tmp_path):
    """Run the component with explicit obs keys, metadata selection and thresholds.

    The component is invoked with every ``--obs_*`` argument spelled out,
    ``--obs_metadata cell_type`` and both minimum thresholds set to 20. The
    test checks that only the requested metadata column is exported (``batch``,
    ``condition`` and ``donor_id`` must be absent) and that no exported cell
    falls below either threshold.

    Args:
        run_component: fixture that executes the component with a list of
            command-line arguments.
        tmp_path: pytest-provided temporary directory for the output files.
    """
    output_json_path = tmp_path / "output.json"
    output_reporting_json_path = tmp_path / "output_reporting.json"
    input_dir = meta["resources_dir"] + "/resources_test/qc_sample_data"

    run_component(
        [
            "--input", input_dir + "/sample_one.qc.cellbender.h5mu",
            "--input", input_dir + "/sample_two.qc.cellbender.h5mu",
            "--ingestion_method", "cellranger_multi",
            "--output", output_json_path,
            "--output_reporting_json", output_reporting_json_path,
            "--obs_sample_id", "sample_id",
            "--obs_total_counts", "total_counts",
            "--obs_num_nonzero_vars", "num_nonzero_vars",
            "--obs_fraction_mitochondrial", "fraction_mitochondrial",
            "--obs_fraction_ribosomal", "fraction_ribosomal",
            "--min_total_counts", "20",
            "--min_num_nonzero_vars", "20",
            "--obs_metadata", "cell_type",
        ]
    )

    assert os.path.exists(output_json_path), "Output file was not created"

    with open(output_json_path, "r") as f:
        output_json_dict = json.load(f)

    assert output_json_dict.keys() == {
        "cell_rna_stats",
        "sample_summary_stats",
        "metrics_cellranger_stats",
    }, f"Unexpected top-level keys: {sorted(output_json_dict.keys())}"

    cell_columns = output_json_dict["cell_rna_stats"]["columns"]
    column_names = {col["name"] for col in cell_columns}
    expected_column_names = {
        "sample_id", "total_counts", "num_nonzero_vars",
        "fraction_mitochondrial", "fraction_ribosomal",
        "cellbender_background_fraction", "cellbender_cell_probability",
        "cellbender_cell_size", "cellbender_droplet_efficiency",
        "cell_type",
    }
    unexpected_column_names = {"batch", "condition", "donor_id"}

    # Set operations name the offending columns when a check fails.
    missing_columns = expected_column_names - column_names
    assert not missing_columns, (
        f"Columns missing from cell_rna_stats: {sorted(missing_columns)}"
    )
    leaked_columns = unexpected_column_names & column_names
    assert not leaked_columns, (
        f"Metadata columns that were not requested: {sorted(leaked_columns)}"
    )

    expected_table_keys = {
        "num_rows", "num_cols", "min_total_counts", "min_num_nonzero_vars", "columns"
    }
    for key, table in output_json_dict.items():
        assert table.keys() == expected_table_keys, (
            f"Unexpected keys in '{key}': {sorted(table.keys())}"
        )
        for col in table["columns"]:
            assert {"name", "dtype", "data"}.issubset(col.keys()), (
                f"Column in '{key}' lacks name/dtype/data: {sorted(col.keys())}"
            )

    total_counts = next(col for col in cell_columns if col["name"] == "total_counts")
    assert min(total_counts["data"]) >= 20, (
        "Found a cell below --min_total_counts 20"
    )

    num_nonzero_vars = next(
        col for col in cell_columns if col["name"] == "num_nonzero_vars"
    )
    assert min(num_nonzero_vars["data"]) >= 20, (
        "Found a cell below --min_num_nonzero_vars 20"
    )
|
||||
|
||||
|
||||
def test_xenium_execution(run_component, tmp_path):
    """Run the component on Xenium data and check the JSON layout.

    The same ``xenium_tiny.qc.h5mu`` file is passed twice as input, with
    ``--ingestion_method xenium`` and ``--min_num_nonzero_vars 1``. The test
    checks that the output has no ``metrics_cellranger_stats`` table, that the
    expected per-cell columns are present, and that every table has the
    expected shape.

    Args:
        run_component: fixture that executes the component with a list of
            command-line arguments.
        tmp_path: pytest-provided temporary directory for the output files.
    """
    output_json_path = tmp_path / "output.json"
    output_reporting_json_path = tmp_path / "output_reporting.json"
    xenium_input = (
        meta["resources_dir"]
        + "/resources_test/spatial_qc_sample_data/xenium_tiny.qc.h5mu"
    )

    run_component(
        [
            "--input", xenium_input,
            "--input", xenium_input,
            "--ingestion_method", "xenium",
            "--min_num_nonzero_vars", "1",
            "--output", output_json_path,
            "--output_reporting_json", output_reporting_json_path,
        ]
    )

    assert os.path.exists(output_json_path), "Output file was not created"

    with open(output_json_path, "r") as f:
        output_json_dict = json.load(f)

    assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats"}, (
        f"Unexpected top-level keys: {sorted(output_json_dict.keys())}"
    )
    # Implied by the equality above; kept to state the intent explicitly.
    assert "metrics_cellranger_stats" not in output_json_dict, (
        "Xenium output should not contain CellRanger metrics"
    )

    column_names_cell = {
        col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]
    }
    expected_column_names = {
        "sample_id", "total_counts", "num_nonzero_vars",
        "fraction_mitochondrial", "fraction_ribosomal",
        "cell_area", "nucleus_ratio",
        "x_coord", "y_coord", "cell_id", "segmentation_method", "region",
    }

    # A set difference names the offending columns when the check fails.
    missing_columns = expected_column_names - column_names_cell
    assert not missing_columns, (
        f"Columns missing from cell_rna_stats: {sorted(missing_columns)}"
    )

    expected_table_keys = {
        "num_rows", "num_cols", "min_total_counts", "min_num_nonzero_vars", "columns"
    }
    for key, table in output_json_dict.items():
        assert table.keys() == expected_table_keys, (
            f"Unexpected keys in '{key}': {sorted(table.keys())}"
        )
        for col in table["columns"]:
            assert {"name", "dtype", "data"}.issubset(col.keys()), (
                f"Column in '{key}' lacks name/dtype/data: {sorted(col.keys())}"
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(pytest.main([__file__]))
|
||||
12
src/utils/setup_logger.py
Normal file
12
src/utils/setup_logger.py
Normal file
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to write INFO-and-above records to stdout.

    Safe to call more than once: the console handler installed by an earlier
    call is replaced instead of a second one being stacked on top, so each
    record is printed only once.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    # Name used to recognise the handler installed by this function.
    handler_name = "setup_logger_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Remove the handler from a previous call; iterate over a copy because
    # removeHandler mutates logger.handlers.
    for existing_handler in list(logger.handlers):
        if existing_handler.get_name() == handler_name:
            logger.removeHandler(existing_handler)

    console_handler = logging.StreamHandler(stdout)
    console_handler.set_name(handler_name)
    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    console_handler.setFormatter(logFormatter)
    logger.addHandler(console_handler)

    return logger
|
||||
147
src/workflows/generate_qc_report/config.vsh.yaml
Normal file
147
src/workflows/generate_qc_report/config.vsh.yaml
Normal file
@@ -0,0 +1,147 @@
|
||||
name: generate_qc_report
|
||||
namespace: workflows
|
||||
description: Run the ingestion QC report generation
|
||||
authors:
|
||||
- __merge__: /src/authors/jakub_majercik.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/dorien_roosen.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/robrecht_cannoodt.yaml
|
||||
roles: [author]
|
||||
- __merge__: /src/authors/weiwei_schultz.yaml
|
||||
roles: [contributor]
|
||||
argument_groups:
|
||||
# TO DO: it would be nice if the sample metadata was already
|
||||
# included in the h5mu files, so that we don't need to pass it.
|
||||
- name: Inputs
|
||||
arguments:
|
||||
- name: --id
|
||||
type: string
|
||||
required: false
|
||||
direction: input
|
||||
description: |
|
||||
The sample IDs to include in the report. If not provided,
|
||||
the sample IDs will be extracted from the h5mu files.
|
||||
example: sample1
|
||||
- name: --input
|
||||
type: file
|
||||
required: true
|
||||
direction: input
|
||||
description: The input h5mu files.
|
||||
example: path/to/file1.h5mu
|
||||
- name: --ingestion_method
|
||||
type: string
|
||||
required: true
|
||||
choices:
|
||||
- cellranger_multi
|
||||
- xenium
|
||||
- name: --sample_metadata
|
||||
type: file
|
||||
required: false
|
||||
direction: input
|
||||
description: |
|
||||
The sample metadata file corresponding to .obs fields in the h5mu input files, to be used for grouping in the report.
|
||||
example: path/to/file.csv
|
||||
- name: --max_samples_per_report
|
||||
type: integer
|
||||
default: 20
|
||||
description: |
|
||||
The maximum number of samples to be included per report.
|
||||
Multiple reports will be generated (with samples equally divided over all reports) if the number of input samples exceeds this threshold.
|
||||
- name: Options
|
||||
arguments:
|
||||
- name: "--var_gene_names"
|
||||
example: "gene_symbol"
|
||||
type: string
|
||||
description: |
|
||||
The column name in the .var h5mu files that contains the gene names. If not provided, .var_names will be used.
|
||||
- name: --obs_metadata
|
||||
type: string
|
||||
multiple: true
|
||||
description: The metadata keys in the h5mu .obs to include in the report.
|
||||
example: [donor_id, cell_type, batch, condition]
|
||||
|
||||
- name: QC options
|
||||
arguments:
|
||||
- name: "--var_name_mitochondrial_genes"
|
||||
type: string
|
||||
required: false
|
||||
default: "mitochondrial"
|
||||
description: |
|
||||
In which .var slot to store a boolean array corresponding to the mitochondrial genes.
|
||||
- name: "--var_name_ribosomal_genes"
|
||||
type: string
|
||||
required: false
|
||||
default: "ribosomal"
|
||||
description: |
|
||||
In which .var slot to store a boolean array corresponding to the ribosomal genes.
|
||||
- name: --min_total_counts
|
||||
type: integer
|
||||
description: |
|
||||
Minimum total counts for a cell to be included in the output.
|
||||
default: 10
|
||||
min: 1
|
||||
- name: --min_num_nonzero_vars
|
||||
type: integer
|
||||
description: |
|
||||
Minimum number of nonzero vars for a cell to be included in the output.
|
||||
default: 10
|
||||
min: 1
|
||||
|
||||
- name: Cellbender options
|
||||
arguments:
|
||||
- name: "--run_cellbender"
|
||||
type: boolean
|
||||
required: false
|
||||
description: Whether to run cellbender or not.
|
||||
default: false
|
||||
- name: "--cellbender_epochs"
|
||||
type: integer
|
||||
required: false
|
||||
description: Number of epochs to train cellbender.
|
||||
default: 150
|
||||
|
||||
- name: Outputs
|
||||
arguments:
|
||||
- name: --output_qc_report
|
||||
type: file
|
||||
required: true
|
||||
multiple: true
|
||||
direction: output
|
||||
description: The output HTML report
|
||||
example: path/to/file.html
|
||||
- name: --output_processed_h5mu
|
||||
type: file
|
||||
required: true
|
||||
direction: output
|
||||
description: Folder containing the processed h5mu files.
|
||||
default: qc_h5mu
|
||||
|
||||
resources:
|
||||
- type: nextflow_script
|
||||
entrypoint: run_wf
|
||||
path: main.nf
|
||||
|
||||
test_resources:
|
||||
- type: nextflow_script
|
||||
path: test.nf
|
||||
entrypoint: test_no_cellbender
|
||||
- type: nextflow_script
|
||||
path: test.nf
|
||||
entrypoint: test_with_cellbender
|
||||
|
||||
dependencies:
|
||||
- name: metadata/add_id
|
||||
repository: openpipeline
|
||||
- name: workflows/qc/qc
|
||||
repository: openpipeline
|
||||
- name: correction/cellbender_remove_background
|
||||
alias: cellbender
|
||||
repository: openpipeline
|
||||
- name: ingestion_qc/h5mu_to_qc_json
|
||||
- name: ingestion_qc/generate_html
|
||||
- name: move_files_to_directory
|
||||
repository: craftbox
|
||||
|
||||
runners:
|
||||
- type: nextflow
|
||||
37
src/workflows/generate_qc_report/integration_test.sh
Executable file
37
src/workflows/generate_qc_report/integration_test.sh
Executable file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bash

# Integration test for the generate_qc_report workflow: build the component,
# then run every test entrypoint defined in test.nf.

# Stop at the first failing command (including failures inside a pipeline),
# so a failed build or an earlier failing test run is not hidden by the exit
# status of a later, successful one.
set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the commands below are run from the root of the repository
cd "$REPO_ROOT"

viash ns build --setup cb -q generate_qc_report

# Run the test workflows one after another, in the same order as before.
for entry in test_no_cellbender test_xenium test_with_cellbender test_multiple_reports; do
  nextflow run . \
    -main-script src/workflows/generate_qc_report/test.nf \
    -profile docker,no_publish,local \
    -entry "$entry" \
    -c src/configs/labels_ci.config \
    -resume
done
|
||||
208
src/workflows/generate_qc_report/main.nf
Normal file
208
src/workflows/generate_qc_report/main.nf
Normal file
@@ -0,0 +1,208 @@
|
||||
workflow run_wf {
|
||||
take: input_ch
|
||||
main:
|
||||
qc_ch = input_ch
|
||||
// store join id
|
||||
| map { id, state ->
|
||||
[id, state + [_meta: [join_id: id]]]
|
||||
}
|
||||
|
||||
// add sample ids to each state
|
||||
| add_id.run(
|
||||
fromState: [
|
||||
input_id: "id",
|
||||
input: "input"
|
||||
],
|
||||
args: [
|
||||
obs_output: "sample_id"
|
||||
],
|
||||
toState: [ "input": "output" ]
|
||||
)
|
||||
|
||||
// run cellbender
|
||||
| cellbender.run(
|
||||
runIf: {id, state -> state.run_cellbender},
|
||||
fromState: [
|
||||
id: "id",
|
||||
input: "input",
|
||||
epochs: "cellbender_epochs",
|
||||
],
|
||||
args: [
|
||||
obs_background_fraction: "cellbender_background_fraction",
|
||||
obs_cell_probability: "cellbender_cell_probability",
|
||||
obs_droplet_efficiency: "cellbender_droplet_efficiency",
|
||||
obs_cell_size: "cellbender_cell_size",
|
||||
],
|
||||
toState: ["input": "output"]
|
||||
)
|
||||
|
||||
// run qc on each sample
|
||||
| qc.run(
|
||||
fromState: [
|
||||
id: "id",
|
||||
input: "input",
|
||||
var_gene_names: "var_gene_names"
|
||||
],
|
||||
args: [
|
||||
var_name_mitochondrial_genes: "mitochondrial",
|
||||
var_name_ribosomal_genes: "ribosomal",
|
||||
output_obs_num_nonzero_vars: "num_nonzero_vars",
|
||||
output_obs_total_counts_vars: "total_counts"
|
||||
],
|
||||
toState: { id, output, state ->
|
||||
def keysToRemove = ["var_gene_names", "var_name_mitochondrial_genes", "var_name_ribosomal_genes", "run_cellbender", "cellbender_epochs"]
|
||||
def newState = state.findAll{it.key !in keysToRemove}
|
||||
newState + ["input": output.output]
|
||||
}
|
||||
)
|
||||
|
||||
| joinStates { ids, states ->
  // Merge the per-sample states into a single state with id `qc_data`.
  def newId = "qc_data"

  // Per-sample values are collected into lists rather than merged.
  // `_meta.join_id` is taken from the first id handed to this closure.
  def new_state_non_unique_values = [
    input: states.collect{ it.input },
    join_ids: states.collect{ it._meta.join_id },
    _meta: [join_id: ids[0]]
  ]

  // Union of the keys found in any state, minus the ones handled above
  // and the ones that are not carried over.
  def all_state_keys = states.inject([].toSet()){ current_keys, state ->
    current_keys + state.keySet()
  }.minus(["output", "id", "input", "_meta"])

  // Every remaining argument must have the same value in all samples;
  // that single value is copied into the merged state.
  def new_state = all_state_keys.inject([:]){ old_state, argument_name ->
    // `def` keeps the variable local to this closure invocation; without it
    // the name is script-global and shared between invocations.
    def argument_values = states.collect{ it.get(argument_name) }.unique()
    assert argument_values.size() == 1, "Arguments should be the same across samples. Argument name: $argument_name, argument value: $argument_values"
    old_state + [(argument_name): argument_values[0]]
  }

  def data_state = new_state_non_unique_values + new_state
  [ newId, data_state ]
}
|
||||
|
||||
processed_files_ch = qc_ch
|
||||
|
||||
// move all processed h5mu files to the same folder
|
||||
| move_files_to_directory.run(
|
||||
fromState: [
|
||||
input: "input",
|
||||
output: "output_processed_h5mu"
|
||||
],
|
||||
toState: [ "output_processed_h5mu": "output" ]
|
||||
)
|
||||
| setState(["output_processed_h5mu"])
|
||||
|
||||
report_ch = qc_ch
|
||||
// group the processed samples to generate one or multiple reports
|
||||
| flatMap { id, state ->
|
||||
|
||||
// calculate number of reports to be generated and number of samples per report
|
||||
def totalInputs = state.input.size()
|
||||
def maxSamplesPerGroup = state.max_samples_per_report
|
||||
def numGroups = Math.max(1, Math.ceil(totalInputs / maxSamplesPerGroup) as Integer)
|
||||
def baseSamplesPerGroup = totalInputs.intdiv(numGroups)
|
||||
def remainder = totalInputs % numGroups
|
||||
|
||||
println "Dividing ${totalInputs} sample(s) over ${numGroups} report(s) (max ${maxSamplesPerGroup} per report)"
|
||||
|
||||
// sort inputs to make grouping deterministic
|
||||
def inputs = []
|
||||
for (int i = 0; i < state.input.size(); i++) {
|
||||
inputs << [input: state.input[i], _meta: [join_id: state.join_ids[i]]]
|
||||
}
|
||||
|
||||
def sortedInputs = inputs.sort { it._meta.join_id }
|
||||
|
||||
def groups = []
|
||||
def itemIndex = 0
|
||||
|
||||
// create one channel per report
|
||||
(0..<numGroups).each { groupNum ->
|
||||
def samplesInGroup = baseSamplesPerGroup + (groupNum < remainder ? 1 : 0)
|
||||
def groupItems = sortedInputs[itemIndex..<(itemIndex + samplesInGroup)]
|
||||
|
||||
def newId = "combined_${groupNum + 1}_of_${numGroups}"
|
||||
def newState = state.clone() // Copy all the original state
|
||||
|
||||
// Override the input and _meta with the grouped items
|
||||
newState.input = groupItems.collect { it.input }
|
||||
newState._meta = groupItems[0]._meta
|
||||
|
||||
println "Group ${groupNum + 1}: ${samplesInGroup} samples - ${newState._meta}"
|
||||
|
||||
groups << [newId, newState]
|
||||
itemIndex += samplesInGroup
|
||||
}
|
||||
|
||||
return groups
|
||||
}
|
||||
|
||||
// Set aside output for QC report instructions
|
||||
| map { id, state ->
|
||||
def new_state = state + ["output_reporting_json": "reporting_json.json"]
|
||||
[id, new_state]
|
||||
}
|
||||
|
||||
// generate qc json
|
||||
| h5mu_to_qc_json.run(
|
||||
fromState: [
|
||||
input: "input",
|
||||
ingestion_method: "ingestion_method",
|
||||
obs_metadata: "obs_metadata",
|
||||
min_total_counts: "min_total_counts",
|
||||
min_num_nonzero_vars: "min_num_nonzero_vars"
|
||||
],
|
||||
args: [
|
||||
obs_sample_id: "sample_id",
|
||||
obs_total_counts: "total_counts",
|
||||
obs_num_nonzero_vars: "num_nonzero_vars",
|
||||
obs_fraction_mitochondrial: "fraction_mitochondrial",
|
||||
obs_fraction_ribosomal: "fraction_ribosomal",
|
||||
],
|
||||
toState: [
|
||||
output: "output",
|
||||
output_reporting_json: "output_reporting_json"
|
||||
]
|
||||
)
|
||||
|
||||
// generate html report
|
||||
| generate_html.run(
|
||||
fromState: [
|
||||
input_data: "output",
|
||||
input_structure: "output_reporting_json"
|
||||
],
|
||||
toState: [
|
||||
output_qc_report: "output_qc_report"
|
||||
]
|
||||
)
|
||||
|
||||
// collect the reports into a single channel
|
||||
| joinStates { ids, states ->
|
||||
def newId = "qc_report"
|
||||
def report_state = [
|
||||
output_qc_report: states.collect{it.output_qc_report},
|
||||
_meta: states[0]._meta
|
||||
]
|
||||
[ newId, report_state ]
|
||||
}
|
||||
|
||||
output_ch = report_ch.mix(processed_files_ch)
|
||||
|
||||
| joinStates { ids, states ->
|
||||
|
||||
assert states.size() == 2, "Expected 2 states, but got ${states.size()}"
|
||||
assert ids.contains('qc_report'), "Expected one channel to have the id `qc_report`, but got ${ids}"
|
||||
assert ids.contains('qc_data'), "Expected one channel to have the id `qc_data`, but got ${ids}"
|
||||
|
||||
def newId = "combined"
|
||||
def combined_state = states[0] + states [1]
|
||||
|
||||
[ newId, combined_state ]
|
||||
}
|
||||
|
||||
emit: output_ch
|
||||
}
|
||||
10
src/workflows/generate_qc_report/nextflow.config
Normal file
10
src/workflows/generate_qc_report/nextflow.config
Normal file
@@ -0,0 +1,10 @@
|
||||
manifest {
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
}
|
||||
|
||||
params {
|
||||
rootDir = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString()
|
||||
}
|
||||
|
||||
// include common settings
|
||||
includeConfig("${params.rootDir}/src/configs/labels.config")
|
||||
224
src/workflows/generate_qc_report/test.nf
Normal file
224
src/workflows/generate_qc_report/test.nf
Normal file
@@ -0,0 +1,224 @@
|
||||
nextflow.enable.dsl=2
|
||||
targetDir = params.rootDir + "/target/nextflow/workflows"
|
||||
|
||||
include { generate_qc_report } from targetDir + "/generate_qc_report/main.nf"
|
||||
|
||||
params.resources_test = "s3://openpipelines-bio/openpipeline_incubator/resources_test/"
|
||||
|
||||
workflow test_no_cellbender {
|
||||
|
||||
resources_test_file = file(params.resources_test)
|
||||
|
||||
output_ch = Channel.fromList([
|
||||
[
|
||||
id: "sample_1",
|
||||
input: resources_test_file.resolve("qc_sample_data/sample_one.qc.h5mu"),
|
||||
run_cellbender: false,
|
||||
ingestion_method: "cellranger_multi",
|
||||
var_gene_names: "gene_symbol",
|
||||
metadata_obs_keys: ["donor_id", "cell_type", "batch", "condition"],
|
||||
output_html: "report.html",
|
||||
publish_dir: "test_out"
|
||||
],
|
||||
[
|
||||
id: "sample_2",
|
||||
input: resources_test_file.resolve("qc_sample_data/sample_two.qc.h5mu"),
|
||||
ingestion_method: "cellranger_multi",
|
||||
var_gene_names: "gene_symbol",
|
||||
run_cellbender: false,
|
||||
metadata_obs_keys: ["donor_id", "cell_type", "batch", "condition"],
|
||||
output_html: "report.html",
|
||||
publish_dir: "test_out"
|
||||
]
|
||||
])
|
||||
|
||||
| map{ state -> [state.id, state] }
|
||||
| generate_qc_report
|
||||
|
||||
| view { output ->
|
||||
assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
|
||||
def id = output[0]
|
||||
def state = output [1]
|
||||
assert id == "combined": "Output ID should be `combined`"
|
||||
assert state instanceof Map : "State should be a map. Found: ${state}"
|
||||
assert state.containsKey("output_qc_report"): "Output should contain key `output_qc_report`"
|
||||
assert state.containsKey("output_processed_h5mu"): "Output should contain key `output_processed_h5mu`"
|
||||
assert state.output_qc_report.size() == 1 : "Expected exactly one output HTML file to be generated"
|
||||
assert state.output_qc_report.every { it.isFile()} : "All output HTML report file should exist"
|
||||
assert state.output_processed_h5mu.isDirectory() : "Output directory should exist"
|
||||
def files = state.output_processed_h5mu.listFiles().findAll { it.isFile() }
|
||||
assert files.size() == 2 : "Output directory should contain exactly 2 files, but found ${files.size()} files"
|
||||
"Output: $output"
|
||||
}
|
||||
}
|
||||
|
||||
workflow test_xenium {
|
||||
|
||||
resources_test_file = file(params.resources_test)
|
||||
|
||||
output_ch = Channel.fromList([
|
||||
[
|
||||
id: "sample_one",
|
||||
input: resources_test_file.resolve("spatial_qc_sample_data/xenium_tiny.qc.h5mu"),
|
||||
run_cellbender: false,
|
||||
ingestion_method: "xenium",
|
||||
var_gene_names: "gene_ids",
|
||||
min_num_nonzero_vars: "1",
|
||||
output_html: "report.html",
|
||||
publish_dir: "test_out"
|
||||
],
|
||||
[
|
||||
id: "sample_two",
|
||||
input: resources_test_file.resolve("spatial_qc_sample_data/xenium_tiny.qc.h5mu"),
|
||||
ingestion_method: "xenium",
|
||||
var_gene_names: "gene_ids",
|
||||
min_num_nonzero_vars: "1",
|
||||
run_cellbender: false,
|
||||
output_html: "report.html",
|
||||
publish_dir: "test_out"
|
||||
]
|
||||
])
|
||||
|
||||
| map{ state -> [state.id, state] }
|
||||
| generate_qc_report
|
||||
|
||||
| view { output ->
|
||||
assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
|
||||
def id = output[0]
|
||||
def state = output [1]
|
||||
assert id == "combined": "Output ID should be `combined`"
|
||||
assert state instanceof Map : "State should be a map. Found: ${state}"
|
||||
assert state.containsKey("output_qc_report"): "Output should contain key `output_qc_report`"
|
||||
assert state.containsKey("output_processed_h5mu"): "Output should contain key `output_processed_h5mu`"
|
||||
assert state.output_qc_report.size() == 1 : "Expected exactly one output HTML file to be generated"
|
||||
assert state.output_qc_report.every { it.isFile()} : "All output HTML report file should exist"
|
||||
assert state.output_processed_h5mu.isDirectory() : "Output directory should exist"
|
||||
def files = state.output_processed_h5mu.listFiles().findAll { it.isFile() }
|
||||
assert files.size() == 2 : "Output directory should contain exactly 2 files, but found ${files.size()} files"
|
||||
"Output: $output"
|
||||
}
|
||||
}
|
||||
|
||||
workflow test_with_cellbender {
|
||||
|
||||
resources_test_file = file(params.resources_test)
|
||||
|
||||
output_ch = Channel.fromList([
|
||||
[
|
||||
id: "sample_one",
|
||||
input: resources_test_file.resolve("qc_sample_data/sample_one.qc.h5mu"),
|
||||
ingestion_method: "cellranger_multi",
|
||||
var_gene_names: "gene_symbol",
|
||||
run_cellbender: true,
|
||||
cellbender_epochs: 1,
|
||||
output_html: "report.html",
|
||||
publish_dir: "test_out"
|
||||
],
|
||||
[
|
||||
id: "sample_two",
|
||||
input: resources_test_file.resolve("qc_sample_data/sample_two.qc.h5mu"),
|
||||
ingestion_method: "cellranger_multi",
|
||||
var_gene_names: "gene_symbol",
|
||||
run_cellbender: true,
|
||||
cellbender_epochs: 1,
|
||||
output_html: "report.html",
|
||||
publish_dir: "test_out"
|
||||
]
|
||||
])
|
||||
|
||||
| map{ state -> [state.id, state] }
|
||||
| generate_qc_report
|
||||
|
||||
| view { output ->
|
||||
assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
|
||||
def id = output[0]
|
||||
def state = output [1]
|
||||
assert id == "combined": "Output ID should be `combined`"
|
||||
assert state instanceof Map : "State should be a map. Found: ${state}"
|
||||
assert state.containsKey("output_qc_report"): "Output should contain key `output_qc_report`"
|
||||
assert state.containsKey("output_processed_h5mu"): "Output should contain key `output_processed_h5mu`"
|
||||
assert state.output_qc_report.size() == 1 : "Expected exactly one output HTML file to be generated"
|
||||
assert state.output_qc_report.every { it.isFile()} : "All output HTML report file should exist"
|
||||
assert state.output_processed_h5mu.isDirectory() : "Output directory should exist"
|
||||
def files = state.output_processed_h5mu.listFiles().findAll { it.isFile() }
|
||||
assert files.size() == 2 : "Output directory should contain exactly 2 files, but found ${files.size()} files"
|
||||
"Output: $output"
|
||||
}
|
||||
}
|
||||
|
||||
workflow test_multiple_reports {
|
||||
|
||||
resources_test_file = file(params.resources_test)
|
||||
|
||||
output_ch = Channel.fromList([
|
||||
[
|
||||
id: "sample_1",
|
||||
input: resources_test_file.resolve("qc_sample_data/sample_one.qc.h5mu"),
|
||||
ingestion_method: "cellranger_multi",
|
||||
run_cellbender: false,
|
||||
metadata_obs_keys: ["donor_id", "cell_type", "batch", "condition"],
|
||||
output_html: "report.html",
|
||||
max_samples_per_report: 2,
|
||||
publish_dir: "test_out"
|
||||
],
|
||||
[
|
||||
id: "sample_2",
|
||||
input: resources_test_file.resolve("qc_sample_data/sample_two.qc.h5mu"),
|
||||
ingestion_method: "cellranger_multi",
|
||||
run_cellbender: false,
|
||||
metadata_obs_keys: ["donor_id", "cell_type", "batch", "condition"],
|
||||
output_html: "report.html",
|
||||
max_samples_per_report: 2,
|
||||
publish_dir: "test_out"
|
||||
],
|
||||
[
|
||||
id: "sample_3",
|
||||
input: resources_test_file.resolve("qc_sample_data/sample_one.qc.h5mu"),
|
||||
ingestion_method: "cellranger_multi",
|
||||
run_cellbender: false,
|
||||
metadata_obs_keys: ["donor_id", "cell_type", "batch", "condition"],
|
||||
output_html: "report.html",
|
||||
max_samples_per_report: 2,
|
||||
publish_dir: "test_out"
|
||||
],
|
||||
[
|
||||
id: "sample_4",
|
||||
input: resources_test_file.resolve("qc_sample_data/sample_two.qc.h5mu"),
|
||||
ingestion_method: "cellranger_multi",
|
||||
run_cellbender: false,
|
||||
metadata_obs_keys: ["donor_id", "cell_type", "batch", "condition"],
|
||||
output_html: "report.html",
|
||||
max_samples_per_report: 2,
|
||||
publish_dir: "test_out"
|
||||
],
|
||||
[
|
||||
id: "sample_5",
|
||||
input: resources_test_file.resolve("qc_sample_data/sample_one.qc.h5mu"),
|
||||
ingestion_method: "cellranger_multi",
|
||||
run_cellbender: false,
|
||||
metadata_obs_keys: ["donor_id", "cell_type", "batch", "condition"],
|
||||
output_html: "report.html",
|
||||
max_samples_per_report: 2,
|
||||
publish_dir: "test_out"
|
||||
]
|
||||
])
|
||||
|
||||
| map{ state -> [state.id, state] }
|
||||
| generate_qc_report
|
||||
|
||||
| view { output ->
  // Check the single `combined` event emitted by the workflow: five input
  // samples with max_samples_per_report = 2 must give three reports, and all
  // five processed h5mu files must be in the output directory.
  assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
  def id = output[0]
  def state = output[1]
  assert id == "combined": "Output ID should be `combined`"
  assert state instanceof Map : "State should be a map. Found: ${state}"
  assert state.containsKey("output_qc_report"): "Output should contain key `output_qc_report`"
  assert state.containsKey("output_processed_h5mu"): "Output should contain key `output_processed_h5mu`"
  assert state.output_qc_report.size() == 3 : "Expected exactly three output HTML files to be generated, but found ${state.output_qc_report.size()}"
  assert state.output_qc_report.every { it.isFile() } : "All output HTML report files should exist"
  assert state.output_processed_h5mu.isDirectory() : "Output directory should exist"
  def files = state.output_processed_h5mu.listFiles().findAll { it.isFile() }
  assert files.size() == 5 : "Output directory should contain exactly 5 files, but found ${files.size()} files"
  "Output: $output"
}
|
||||
}
|
||||
0
target/.build.yaml
Normal file
0
target/.build.yaml
Normal file
@@ -0,0 +1,484 @@
|
||||
name: "h5mu_to_qc_json"
|
||||
namespace: "ingestion_qc"
|
||||
version: "v0.1.0"
|
||||
authors:
|
||||
- name: "Jakub Majercik"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Contributor"
|
||||
links:
|
||||
email: "jakub@data-intuitive.com"
|
||||
github: "jakubmajercik"
|
||||
linkedin: "jakubmajercik"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Bioinformatics Engineer"
|
||||
- name: "Dorien Roosen"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dorien@data-intuitive.com"
|
||||
github: "dorien-er"
|
||||
linkedin: "dorien-roosen"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
- name: "Robrecht Cannoodt"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "robrecht@data-intuitive.com"
|
||||
github: "rcannood"
|
||||
orcid: "0000-0003-3641-729X"
|
||||
linkedin: "robrechtcannoodt"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Science Engineer"
|
||||
- name: "Open Problems"
|
||||
href: "https://openproblems.bio"
|
||||
role: "Core Member"
|
||||
- name: "Weiwei Schultz"
|
||||
roles:
|
||||
- "contributor"
|
||||
info:
|
||||
role: "Contributor"
|
||||
organizations:
|
||||
- name: "Janssen R&D US"
|
||||
role: "Associate Director Data Sciences"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
description: "The input h5mu file(s)"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "The modality to use"
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--ingestion_method"
|
||||
description: "Method that was used to ingest the data - this will define the structure\
|
||||
\ of the report that is generated."
|
||||
info: null
|
||||
required: true
|
||||
choices:
|
||||
- "cellranger_multi"
|
||||
- "xenium"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_sample_id"
|
||||
description: "The key in the h5mu file that contains the sample ID. If not provided,\
|
||||
\ each H5MU file will be considered as a separate sample."
|
||||
info: null
|
||||
default:
|
||||
- "sample_id"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_total_counts"
|
||||
description: "The key in the h5mu .obs field that contains the total counts."
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_num_nonzero_vars"
|
||||
description: "The key in the h5mu .obs field that contains the number of nonzero\
|
||||
\ vars."
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_vars"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_fraction_mitochondrial"
|
||||
description: "The key in the h5mu .obs field that contains the fraction mitochondrial\
|
||||
\ genes."
|
||||
info: null
|
||||
default:
|
||||
- "fraction_mitochondrial"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_fraction_ribosomal"
|
||||
description: "The key in the h5mu .obs field that contains the fraction ribosomal\
|
||||
\ genes."
|
||||
info: null
|
||||
default:
|
||||
- "fraction_ribosomal"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
description: "The output JSON file"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.json"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output_reporting_json"
|
||||
description: "The output JSON file that defines the QC report"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.json"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Filtering & grouping options"
|
||||
arguments:
|
||||
- type: "integer"
|
||||
name: "--min_total_counts"
|
||||
description: "Minimum total counts for a cell to be included in the output"
|
||||
info: null
|
||||
default:
|
||||
- 10
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--min_num_nonzero_vars"
|
||||
description: "Minimum number of nonzero vars for a cell to be included in the\
|
||||
\ output"
|
||||
info: null
|
||||
default:
|
||||
- 10
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_metadata"
|
||||
description: "The metadata keys in the h5mu .obs to include in the output JSON."
|
||||
info: null
|
||||
example:
|
||||
- "donor_id;cell_type;batch;condition"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- name: "Options for CellRanger reports"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--obs_cellbender"
|
||||
description: "The cellbender keys in the h5mu .obs to include in the output JSON"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_background_fraction"
|
||||
- "cellbender_cell_probability"
|
||||
- "cellbender_cell_size"
|
||||
- "cellbender_droplet_efficiency"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--uns_cellranger_metrics"
|
||||
description: "The key in the h5mu file .uns that contains the cellranger metrics"
|
||||
info: null
|
||||
default:
|
||||
- "metrics_cellranger"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Options for Xenium reports"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--obs_nucleus_area"
|
||||
description: "The key in the h5mu .obs field that contains the nucleus area."
|
||||
info: null
|
||||
default:
|
||||
- "nucleus_area"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_cell_area"
|
||||
description: "The key in the h5mu .obs field that contains the cell area."
|
||||
info: null
|
||||
default:
|
||||
- "cell_area"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_x_coord"
|
||||
description: "The key in the h5mu .obs field that contains the x coordinate."
|
||||
info: null
|
||||
default:
|
||||
- "x_coord"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_y_coord"
|
||||
description: "The key in the h5mu .obs field that contains the y coordinate."
|
||||
info: null
|
||||
default:
|
||||
- "y_coord"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_control_probe_counts"
|
||||
description: "The key in the h5mu .obs field that contains the number of control\
|
||||
\ probes."
|
||||
info: null
|
||||
default:
|
||||
- "control_probe_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_control_codeword_counts"
|
||||
description: "The key in the h5mu .obs field that contains the number of control\
|
||||
\ codewords."
|
||||
info: null
|
||||
default:
|
||||
- "control_codeword_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "report_structure"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Takes H5MU files that have been ingested by CellRanger, Xenium or CosMx\
|
||||
\ and processed by the QC workflow, and generates:\n- A JSON file that contains\
|
||||
\ the combined data for the QC report\n- A JSON file that defines the layout and\
|
||||
\ structure of the QC report\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "resources_test"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "private"
|
||||
target: "private"
|
||||
requirements:
|
||||
commands:
|
||||
- "ps"
|
||||
repositories:
|
||||
- type: "github"
|
||||
name: "openpipeline"
|
||||
repo: "openpipelines-bio/openpipeline"
|
||||
tag: "2.1.2"
|
||||
- type: "vsh"
|
||||
name: "craftbox"
|
||||
repo: "craftbox"
|
||||
tag: "v0.2.0"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "midmem"
|
||||
- "middisk"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.12-slim"
|
||||
target_registry: "images.viash-hub.com"
|
||||
target_tag: "v0.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
- type: "native"
|
||||
id: "native"
|
||||
build_info:
|
||||
config: "src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml"
|
||||
runner: "executable"
|
||||
engine: "docker|native"
|
||||
output: "target/_private/executable/ingestion_qc/h5mu_to_qc_json"
|
||||
executable: "target/_private/executable/ingestion_qc/h5mu_to_qc_json/h5mu_to_qc_json"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "4de00a2614069bdaee27943e73a51d378e465c60"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
git_tag: "v0.1.0"
|
||||
package_config:
|
||||
name: "openpipeline_qc"
|
||||
version: "v0.1.0"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-bio/openpipeline_incubator/resources_test"
|
||||
dest: "resources_test"
|
||||
repositories:
|
||||
- type: "github"
|
||||
name: "openpipeline"
|
||||
repo: "openpipelines-bio/openpipeline"
|
||||
tag: "2.1.2"
|
||||
- type: "vsh"
|
||||
name: "craftbox"
|
||||
repo: "craftbox"
|
||||
tag: "v0.2.0"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].directives.tag\
|
||||
\ := '$id'\n.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".engines += { type: \"native\" }"
|
||||
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
|
||||
- ".engines[.type == 'docker'].target_tag := 'v0.1.0'"
|
||||
organization: "vsh"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
docker_registry: "ghcr.io"
|
||||
1939
target/_private/executable/ingestion_qc/h5mu_to_qc_json/h5mu_to_qc_json
Executable file
1939
target/_private/executable/ingestion_qc/h5mu_to_qc_json/h5mu_to_qc_json
Executable file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Limit a requested memory amount to the configured `process.maxMemory`.
 *
 * Used by the `withLabel: *mem` selectors, which pass in a memory request
 * scaled by `task.attempt`.
 *
 * @param to_compare the requested memory amount; it must support
 *                   `compareTo` against a `nextflow.util.MemoryUnit`.
 * @return `to_compare` when no `process.maxMemory` is configured or when the
 *         request does not exceed it; otherwise `process.maxMemory`.
 *
 * If any of the checks throws (for example because `process.maxMemory` or
 * `process.maxRetries` cannot be converted), an error is printed and the JVM
 * exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; presumably it
    // is resolved from the context of the calling directive closure - confirm.
    // When the attempt number equals maxRetries, request the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // The request exceeds the ceiling: clamp to the ceiling. This branch used
    // to reference an undefined `max_memory` variable, which always ended up
    // in the catch block below. Only the sign of compareTo is significant.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,162 @@
|
||||
{
|
||||
"categories": [
|
||||
{
|
||||
"name": "Sample QC",
|
||||
"key": "sample_summary_stats",
|
||||
"additionalAxes": false,
|
||||
"defaultFilters": []
|
||||
},
|
||||
{
|
||||
"name": "SampleQC",
|
||||
"key": "metrics_cellranger_stats",
|
||||
"additionalAxes": false,
|
||||
"defaultFilters": [
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Number_of_reads_in_the_library",
|
||||
"label": "Number of reads per library",
|
||||
"description": "Sequencing depth per sample. Higher values generally indicate more comprehensive cell profiling.",
|
||||
"nBins": 10,
|
||||
"groupBy": "sample_id",
|
||||
"xAxisType": "linear",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Confidently_mapped_reads_in_cells",
|
||||
"label": "Confidently mapped reads in cells",
|
||||
"description": "Number of reads that were mapped unambiguously to the reference genome within cell-containing droplets.",
|
||||
"groupBy": "sample_id",
|
||||
"nBins": 10,
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Estimated_number_of_cells",
|
||||
"label": "Estimated number of cells",
|
||||
"description": "CellRanger's estimate of the number of cells per sample based on the UMI count distribution.",
|
||||
"groupBy": "sample_id",
|
||||
"nBins": 10,
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Sequencing_saturation",
|
||||
"label": "Sequencing saturation",
|
||||
"description": "Fraction of reads that are duplicates of existing UMIs. Higher values suggest deeper sequencing coverage.",
|
||||
"groupBy": "sample_id",
|
||||
"nBins": 10,
|
||||
"yAxisType": "linear"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Cell RNA QC",
|
||||
"key": "cell_rna_stats",
|
||||
"additionalAxes": true,
|
||||
"defaultFilters": [
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "total_counts",
|
||||
"label": "Total UMI per cell",
|
||||
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "num_nonzero_vars",
|
||||
"label": "Number of non-zero genes per cell",
|
||||
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "fraction_mitochondrial",
|
||||
"label": "Fraction UMI of mitochondrial genes per cell",
|
||||
"description": "Proportion of cell's RNA from mitochondrial genes.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "fraction_ribosomal",
|
||||
"label": "Fraction UMI of ribosomal genes per cell",
|
||||
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "pct_of_counts_in_top_50_vars",
|
||||
"label": "Fraction UMI in top 50 genes per cell",
|
||||
"description": "Proportion of RNA molecules from the 50 most-expressed genes in each cell.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_cell_probability",
|
||||
"label": "CellBender cell probability",
|
||||
"description": "CellBender's statistical confidence (0-1) that a barcode represents a real cell, with higher values indicating stronger confidence.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_background_fraction",
|
||||
"label": "CellBender background fraction",
|
||||
"description": "Estimated percentage of each cell's RNA that comes from the ambient solution rather than the cell itself.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_cell_size",
|
||||
"label": "CellBender cell size",
|
||||
"description": "CellBender's estimate of the true number of RNA molecules in each cell after removing ambient contamination. Reflects actual cell RNA content rather than raw UMI counts.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_droplet_efficiency",
|
||||
"label": "CellBender droplet efficiency",
|
||||
"description": "CellBender's estimate of how efficiently each droplet captured RNA molecules. Higher values indicate more reliable RNA sampling within individual droplets.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
{
|
||||
"categories": [
|
||||
{
|
||||
"name": "Sample QC",
|
||||
"key": "sample_summary_stats",
|
||||
"additionalAxes": false,
|
||||
"defaultFilters": []
|
||||
},
|
||||
{
|
||||
"name": "Cell RNA QC",
|
||||
"key": "cell_rna_stats",
|
||||
"additionalAxes": true,
|
||||
"defaultFilters": [
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "total_counts",
|
||||
"label": "Total UMI per cell",
|
||||
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "num_nonzero_vars",
|
||||
"label": "Number of non-zero genes per cell",
|
||||
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "fraction_mitochondrial",
|
||||
"label": "Fraction UMI of mitochondrial genes per cell",
|
||||
"description": "Proportion of cell's RNA from mitochondrial genes.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "fraction_ribosomal",
|
||||
"label": "Fraction UMI of ribosomal genes per cell",
|
||||
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "cell_area",
|
||||
"label": "Segmented cell area",
|
||||
"description": "Area of the segmented cells.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "nucleus_ratio",
|
||||
"label": "Nucleus Ratio",
|
||||
"description": "Ratio of the nucleus area to the segmented cell area.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    The root logger's level is set to ``logging.INFO`` and a
    ``logging.StreamHandler`` bound to ``sys.stdout`` is attached, using the
    format ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is safe to call more than once: the handler is only attached
    when the root logger does not already have a stream handler writing to the
    same stdout object, so log lines are not duplicated.

    Returns:
        The root ``logging.Logger`` instance.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Handlers accumulate on the (process-wide) root logger, so attaching a
    # new one on every call would print each record several times.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,484 @@
|
||||
name: "h5mu_to_qc_json"
|
||||
namespace: "ingestion_qc"
|
||||
version: "v0.1.0"
|
||||
authors:
|
||||
- name: "Jakub Majercik"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Contributor"
|
||||
links:
|
||||
email: "jakub@data-intuitive.com"
|
||||
github: "jakubmajercik"
|
||||
linkedin: "jakubmajercik"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Bioinformatics Engineer"
|
||||
- name: "Dorien Roosen"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dorien@data-intuitive.com"
|
||||
github: "dorien-er"
|
||||
linkedin: "dorien-roosen"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
- name: "Robrecht Cannoodt"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "robrecht@data-intuitive.com"
|
||||
github: "rcannood"
|
||||
orcid: "0000-0003-3641-729X"
|
||||
linkedin: "robrechtcannoodt"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Science Engineer"
|
||||
- name: "Open Problems"
|
||||
href: "https://openproblems.bio"
|
||||
role: "Core Member"
|
||||
- name: "Weiwei Schultz"
|
||||
roles:
|
||||
- "contributor"
|
||||
info:
|
||||
role: "Contributor"
|
||||
organizations:
|
||||
- name: "Janssen R&D US"
|
||||
role: "Associate Director Data Sciences"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
description: "The input h5mu file(s)"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "The modality to use"
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--ingestion_method"
|
||||
description: "Method that was used to ingest the data - this will define the structure\
|
||||
\ of the report that is generated."
|
||||
info: null
|
||||
required: true
|
||||
choices:
|
||||
- "cellranger_multi"
|
||||
- "xenium"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_sample_id"
|
||||
description: "The key in the h5mu file that contains the sample ID. If not provided,\
|
||||
\ each H5MU file will be considered as a separate sample."
|
||||
info: null
|
||||
default:
|
||||
- "sample_id"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_total_counts"
|
||||
description: "The key in the h5mu .obs field that contains the total counts."
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_num_nonzero_vars"
|
||||
description: "The key in the h5mu .obs field that contains the number of nonzero\
|
||||
\ vars."
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_vars"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_fraction_mitochondrial"
|
||||
description: "The key in the h5mu .obs field that contains the fraction mitochondrial\
|
||||
\ genes."
|
||||
info: null
|
||||
default:
|
||||
- "fraction_mitochondrial"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_fraction_ribosomal"
|
||||
description: "The key in the h5mu .obs field that contains the fraction ribosomal\
|
||||
\ genes."
|
||||
info: null
|
||||
default:
|
||||
- "fraction_ribosomal"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
description: "The output JSON file"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.json"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output_reporting_json"
|
||||
description: "The output JSON file that defines the QC report"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.json"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Filtering & grouping options"
|
||||
arguments:
|
||||
- type: "integer"
|
||||
name: "--min_total_counts"
|
||||
description: "Minimum total counts for a cell to be included in the output"
|
||||
info: null
|
||||
default:
|
||||
- 10
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--min_num_nonzero_vars"
|
||||
description: "Minimum number of nonzero vars for a cell to be included in the\
|
||||
\ output"
|
||||
info: null
|
||||
default:
|
||||
- 10
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_metadata"
|
||||
description: "The metadata keys in the h5mu .obs to include in the output JSON."
|
||||
info: null
|
||||
example:
|
||||
- "donor_id;cell_type;batch;condition"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- name: "Options for CellRanger reports"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--obs_cellbender"
|
||||
description: "The cellbender keys in the h5mu .obs to include in the output JSON"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_background_fraction"
|
||||
- "cellbender_cell_probability"
|
||||
- "cellbender_cell_size"
|
||||
- "cellbender_droplet_efficiency"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--uns_cellranger_metrics"
|
||||
description: "The key in the h5mu file .uns that contains the cellranger metrics"
|
||||
info: null
|
||||
default:
|
||||
- "metrics_cellranger"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Options for Xenium reports"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--obs_nucleus_area"
|
||||
description: "The key in the h5mu .obs field that contains the nucleus area."
|
||||
info: null
|
||||
default:
|
||||
- "nucleus_area"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_cell_area"
|
||||
description: "The key in the h5mu .obs field that contains the cell area."
|
||||
info: null
|
||||
default:
|
||||
- "cell_area"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_x_coord"
|
||||
description: "The key in the h5mu .obs field that contains the x coordinate."
|
||||
info: null
|
||||
default:
|
||||
- "x_coord"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_y_coord"
|
||||
description: "The key in the h5mu .obs field that contains the y coordinate."
|
||||
info: null
|
||||
default:
|
||||
- "y_coord"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_control_probe_counts"
|
||||
description: "The key in the h5mu .obs field that contains the number of control\
|
||||
\ probes."
|
||||
info: null
|
||||
default:
|
||||
- "control_probe_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_control_codeword_counts"
|
||||
description: "The key in the h5mu .obs field that contains the number of control\
|
||||
\ codewords."
|
||||
info: null
|
||||
default:
|
||||
- "control_codeword_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "report_structure"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Takes H5MU files that have been ingested by CellRanger, Xenium or CosMx\
|
||||
\ and processed by the QC workflow, and generates:\n- A JSON file that contains\
|
||||
\ the combined data for the QC report\n- A JSON file that defines the layout and\
|
||||
\ structure of the QC report\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "resources_test"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "private"
|
||||
target: "private"
|
||||
requirements:
|
||||
commands:
|
||||
- "ps"
|
||||
repositories:
|
||||
- type: "github"
|
||||
name: "openpipeline"
|
||||
repo: "openpipelines-bio/openpipeline"
|
||||
tag: "2.1.2"
|
||||
- type: "vsh"
|
||||
name: "craftbox"
|
||||
repo: "craftbox"
|
||||
tag: "v0.2.0"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "midmem"
|
||||
- "middisk"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.12-slim"
|
||||
target_registry: "images.viash-hub.com"
|
||||
target_tag: "v0.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
- type: "native"
|
||||
id: "native"
|
||||
build_info:
|
||||
config: "src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker|native"
|
||||
output: "target/_private/nextflow/ingestion_qc/h5mu_to_qc_json"
|
||||
executable: "target/_private/nextflow/ingestion_qc/h5mu_to_qc_json/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "4de00a2614069bdaee27943e73a51d378e465c60"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
git_tag: "v0.1.0"
|
||||
package_config:
|
||||
name: "openpipeline_qc"
|
||||
version: "v0.1.0"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-bio/openpipeline_incubator/resources_test"
|
||||
dest: "resources_test"
|
||||
repositories:
|
||||
- type: "github"
|
||||
name: "openpipeline"
|
||||
repo: "openpipelines-bio/openpipeline"
|
||||
tag: "2.1.2"
|
||||
- type: "vsh"
|
||||
name: "craftbox"
|
||||
repo: "craftbox"
|
||||
tag: "v0.2.0"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].directives.tag\
|
||||
\ := '$id'\n.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".engines += { type: \"native\" }"
|
||||
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
|
||||
- ".engines[.type == 'docker'].target_tag := 'v0.1.0'"
|
||||
organization: "vsh"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
docker_registry: "ghcr.io"
|
||||
4574
target/_private/nextflow/ingestion_qc/h5mu_to_qc_json/main.nf
Normal file
4574
target/_private/nextflow/ingestion_qc/h5mu_to_qc_json/main.nf
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'ingestion_qc/h5mu_to_qc_json'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = 'v0.1.0'
|
||||
description = 'Takes H5MU files that have been ingested by CellRanger, Xenium or CosMx and processed by the QC workflow, and generates:\n- A JSON file that contains the combined data for the QC report\n- A JSON file that defines the layout and structure of the QC report\n'
|
||||
author = 'Jakub Majercik, Dorien Roosen, Robrecht Cannoodt, Weiwei Schultz'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Clamp a requested memory amount to the optional `process.maxMemory` ceiling.
 *
 * Used by the `withLabel: *mem` selectors, which pass in
 * `<base>.GB * task.attempt`.
 *
 * @param to_compare the requested memory amount (compared via `compareTo`
 *                   against `process.maxMemory` coerced to a MemoryUnit)
 * @return `to_compare` when no ceiling is configured or the request fits;
 *         otherwise the configured ceiling
 *
 * Any exception raised while evaluating the limits is reported on stdout and
 * terminates the JVM with exit status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured (key absent, null or otherwise falsy): pass the request through.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // When the attempt number equals maxRetries, hand back the ceiling itself.
    // NOTE(review): `task` is not a parameter of this function -- confirm it
    // resolves in this scope; if it does not, the catch below ends the run.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Request exceeds the ceiling: cap it. This branch previously returned the
      // undefined name `max_memory`, which threw and fell into the catch below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,162 @@
|
||||
{
|
||||
"categories": [
|
||||
{
|
||||
"name": "Sample QC",
|
||||
"key": "sample_summary_stats",
|
||||
"additionalAxes": false,
|
||||
"defaultFilters": []
|
||||
},
|
||||
{
|
||||
"name": "SampleQC",
|
||||
"key": "metrics_cellranger_stats",
|
||||
"additionalAxes": false,
|
||||
"defaultFilters": [
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Number_of_reads_in_the_library",
|
||||
"label": "Number of reads per library",
|
||||
"description": "Sequencing depth per sample. Higher values generally indicate more comprehensive cell profiling.",
|
||||
"nBins": 10,
|
||||
"groupBy": "sample_id",
|
||||
"xAxisType": "linear",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Confidently_mapped_reads_in_cells",
|
||||
"label": "Confidently mapped reads in cells",
|
||||
"description": "Number of reads that were mapped unambiguously to the reference genome within cell-containing droplets.",
|
||||
"groupBy": "sample_id",
|
||||
"nBins": 10,
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Estimated_number_of_cells",
|
||||
"label": "Estimated number of cells",
|
||||
"description": "CellRanger's estimate of the number of cells per sample based on the UMI count distribution.",
|
||||
"groupBy": "sample_id",
|
||||
"nBins": 10,
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "bar",
|
||||
"field": "Sequencing_saturation",
|
||||
"label": "Sequencing saturation",
|
||||
"description": "Fraction of reads that are duplicates of existing UMIs. Higher values suggest deeper sequencing coverage.",
|
||||
"groupBy": "sample_id",
|
||||
"nBins": 10,
|
||||
"yAxisType": "linear"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Cell RNA QC",
|
||||
"key": "cell_rna_stats",
|
||||
"additionalAxes": true,
|
||||
"defaultFilters": [
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "total_counts",
|
||||
"label": "Total UMI per cell",
|
||||
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "num_nonzero_vars",
|
||||
"label": "Number of non-zero genes per cell",
|
||||
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "fraction_mitochondrial",
|
||||
"label": "Fraction UMI of mitochondrial genes per cell",
|
||||
"description": "Proportion of cell's RNA from mitochondrial genes.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "fraction_ribosomal",
|
||||
"label": "Fraction UMI of ribosomal genes per cell",
|
||||
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "pct_of_counts_in_top_50_vars",
|
||||
"label": "Fraction UMI in top 50 genes per cell",
|
||||
"description": "Proportion of RNA molecules from the 50 most-expressed genes in each cell.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_cell_probability",
|
||||
"label": "CellBender cell probability",
|
||||
"description": "CellBender's statistical confidence (0-1) that a barcode represents a real cell, with higher values indicating stronger confidence.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_background_fraction",
|
||||
"label": "CellBender background fraction",
|
||||
"description": "Estimated fraction of each cell's RNA that comes from the ambient solution rather than the cell itself.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_cell_size",
|
||||
"label": "CellBender cell size",
|
||||
"description": "CellBender's estimate of the true number of RNA molecules in each cell after removing ambient contamination. Reflects actual cell RNA content rather than raw UMI counts.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"field": "cellbender_droplet_efficiency",
|
||||
"label": "CellBender droplet efficiency",
|
||||
"description": "CellBender's estimate of how efficiently each droplet captured RNA molecules. Higher values indicate more reliable RNA sampling within individual droplets.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
{
|
||||
"categories": [
|
||||
{
|
||||
"name": "Sample QC",
|
||||
"key": "sample_summary_stats",
|
||||
"additionalAxes": false,
|
||||
"defaultFilters": []
|
||||
},
|
||||
{
|
||||
"name": "Cell RNA QC",
|
||||
"key": "cell_rna_stats",
|
||||
"additionalAxes": true,
|
||||
"defaultFilters": [
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "total_counts",
|
||||
"label": "Total UMI per cell",
|
||||
"description": "Total number of RNA molecules detected per cell. Low values typically indicate empty droplets or low-quality cells that should be filtered out.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "num_nonzero_vars",
|
||||
"label": "Number of non-zero genes per cell",
|
||||
"description": "Count of unique genes detected in each cell. Low gene counts often indicate poor-quality cells.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"zoomMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "fraction_mitochondrial",
|
||||
"label": "Fraction UMI of mitochondrial genes per cell",
|
||||
"description": "Proportion of cell's RNA from mitochondrial genes.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "fraction_ribosomal",
|
||||
"label": "Fraction UMI of ribosomal genes per cell",
|
||||
"description": "Proportion of cell's RNA from ribosomal protein genes. Extreme values may indicate stress responses or cell cycle abnormalities.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "cell_area",
|
||||
"label": "Segmented cell area",
|
||||
"description": "Area of the segmented cells.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
},
|
||||
{
|
||||
"type": "histogram",
|
||||
"visualizationType": "histogram",
|
||||
"field": "nucleus_ratio",
|
||||
"label": "Nucleus Ratio",
|
||||
"description": "Ratio of the nucleus area to the segmented cell area.",
|
||||
"cutoffMin": null,
|
||||
"cutoffMax": null,
|
||||
"nBins": 50,
|
||||
"groupBy": "sample_id",
|
||||
"yAxisType": "linear"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to emit INFO-level records on stdout.

    The root logger's level is set to ``logging.INFO`` and a
    ``logging.StreamHandler`` bound to ``sys.stdout`` is attached, formatting
    records as ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The call is idempotent: if the root logger already has a stream handler
    writing to the current ``sys.stdout``, no further handler is added, so
    calling this function repeatedly does not duplicate log lines.

    Returns:
        The root ``logging.Logger`` instance.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only attach a handler when none is already writing to this stdout;
    # otherwise every record would be printed once per setup_logger() call.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        log_formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(log_formatter)
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,647 @@
|
||||
name: "cellbender_remove_background"
|
||||
namespace: "correction"
|
||||
version: "2.1.2"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Input h5mu file. Data file on which to run tool. Data must be un-filtered:\
|
||||
\ it should include empty droplets."
|
||||
info: null
|
||||
example:
|
||||
- "input.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "Modality to process."
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
description: "Full count matrix as an h5mu file, with background RNA removed.\
|
||||
\ This file contains all the original droplet barcodes."
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--layer_output"
|
||||
description: "Output layer"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_corrected"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_background_fraction"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_background_fraction"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_cell_probability"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_cell_probability"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_cell_size"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_cell_size"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_droplet_efficiency"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_droplet_efficiency"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_latent_scale"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_latent_scale"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--var_ambient_expression"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_ambient_expression"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obsm_gene_expression_encoding"
|
||||
info: null
|
||||
default:
|
||||
- "cellbender_gene_expression_encoding"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Arguments"
|
||||
arguments:
|
||||
- type: "boolean"
|
||||
name: "--expected_cells_from_qc"
|
||||
description: "Will use the Cell Ranger QC to determine the estimated number of\
|
||||
\ cells"
|
||||
info: null
|
||||
default:
|
||||
- false
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--expected_cells"
|
||||
description: "Number of cells expected in the dataset (a rough estimate within\
|
||||
\ a factor of 2 is sufficient)."
|
||||
info: null
|
||||
example:
|
||||
- 1000
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--total_droplets_included"
|
||||
description: "The number of droplets from the rank-ordered UMI plot\nthat will\
|
||||
\ have their cell probabilities inferred as an\noutput. Include the droplets\
|
||||
\ which might contain cells.\nDroplets beyond TOTAL_DROPLETS_INCLUDED should\
|
||||
\ be\n'surely empty' droplets.\n"
|
||||
info: null
|
||||
example:
|
||||
- 25000
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--force_cell_umi_prior"
|
||||
description: "Ignore CellBender's heuristic prior estimation, and use this prior\
|
||||
\ for UMI counts in cells."
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--force_empty_umi_prior"
|
||||
description: "Ignore CellBender's heuristic prior estimation, and use this prior\
|
||||
\ for UMI counts in empty droplets."
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--model"
|
||||
description: "Which model is being used for count data.\n\n* 'naive' subtracts\
|
||||
\ the estimated ambient profile.\n* 'simple' does not model either ambient RNA\
|
||||
\ or random barcode swapping (for debugging purposes -- not recommended).\n\
|
||||
* 'ambient' assumes background RNA is incorporated into droplets.\n* 'swapping'\
|
||||
\ assumes background RNA comes from random barcode swapping (via PCR chimeras).\n\
|
||||
* 'full' uses a combined ambient and swapping model.\n"
|
||||
info: null
|
||||
default:
|
||||
- "full"
|
||||
required: false
|
||||
choices:
|
||||
- "naive"
|
||||
- "simple"
|
||||
- "ambient"
|
||||
- "swapping"
|
||||
- "full"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--epochs"
|
||||
description: "Number of epochs to train."
|
||||
info: null
|
||||
default:
|
||||
- 150
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--low_count_threshold"
|
||||
description: "Droplets with UMI counts below this number are completely \nexcluded\
|
||||
\ from the analysis. This can help identify the correct \nprior for empty droplet\
|
||||
\ counts in the rare case where empty \ncounts are extremely high (over 200).\n"
|
||||
info: null
|
||||
default:
|
||||
- 5
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--z_dim"
|
||||
description: "Dimension of latent variable z.\n"
|
||||
info: null
|
||||
default:
|
||||
- 64
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--z_layers"
|
||||
description: "Dimension of hidden layers in the encoder for z.\n"
|
||||
info: null
|
||||
default:
|
||||
- 512
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--training_fraction"
|
||||
description: "Training detail: the fraction of the data used for training.\nThe\
|
||||
\ rest is never seen by the inference algorithm. Speeds up learning.\n"
|
||||
info: null
|
||||
default:
|
||||
- 0.9
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--empty_drop_training_fraction"
|
||||
description: "Training detail: the fraction of the training data each epoch that\
|
||||
\ \nis drawn (randomly sampled) from surely empty droplets.\n"
|
||||
info: null
|
||||
default:
|
||||
- 0.2
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--ignore_features"
|
||||
description: "Integer indices of features to ignore entirely. In the output\n\
|
||||
count matrix, the counts for these features will be unchanged.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--fpr"
|
||||
description: "Target 'delta' false positive rate in [0, 1). Use 0 for a cohort\n\
|
||||
of samples which will be jointly analyzed for differential expression.\nA false\
|
||||
\ positive is a true signal count that is erroneously removed.\nMore background\
|
||||
\ removal is accompanied by more signal removal at\nhigh values of FPR. You\
|
||||
\ can specify multiple values, which will\ncreate multiple output files.\n"
|
||||
info: null
|
||||
default:
|
||||
- 0.01
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--exclude_feature_types"
|
||||
description: "Feature types to ignore during the analysis. These features will\n\
|
||||
be left unchanged in the output file.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--projected_ambient_count_threshold"
|
||||
description: "Controls how many features are included in the analysis, which\n\
|
||||
can lead to a large speedup. If a feature is expected to have less\nthan PROJECTED_AMBIENT_COUNT_THRESHOLD\
|
||||
\ counts total in all cells\n(summed), then that gene is excluded, and it will\
|
||||
\ be unchanged\nin the output count matrix. For example, \nPROJECTED_AMBIENT_COUNT_THRESHOLD\
|
||||
\ = 0 will include all features\nwhich have even a single count in any empty\
|
||||
\ droplet.\n"
|
||||
info: null
|
||||
default:
|
||||
- 0.1
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--learning_rate"
|
||||
description: "Training detail: lower learning rate for inference.\nA OneCycle\
|
||||
\ learning rate schedule is used, where the\nupper learning rate is ten times\
|
||||
\ this value. (For this\nvalue, probably do not exceed 1e-3).\n"
|
||||
info: null
|
||||
default:
|
||||
- 1.0E-4
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--final_elbo_fail_fraction"
|
||||
description: "Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO\
|
||||
\ - initial_test_ELBO) > FINAL_ELBO_FAIL_FRACTION.\nTraining will automatically\
|
||||
\ re-run if --num-training-tries > 1.\nBy default, will not fail training based\
|
||||
\ on final_training_ELBO.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--epoch_elbo_fail_fraction"
|
||||
description: "Training is considered to have failed if \n(previous_epoch_test_ELBO\
|
||||
\ - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO)\
|
||||
\ > EPOCH_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries\
|
||||
\ > 1.\nBy default, will not fail training based on epoch_training_ELBO.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--num_training_tries"
|
||||
description: "Number of times to attempt to train the model. At each subsequent\
|
||||
\ attempt,\nthe learning rate is multiplied by LEARNING_RATE_RETRY_MULT.\n"
|
||||
info: null
|
||||
default:
|
||||
- 1
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--learning_rate_retry_mult"
|
||||
description: "Learning rate is multiplied by this amount each time a new training\n\
|
||||
attempt is made. (This parameter is only used if training fails based\non EPOCH_ELBO_FAIL_FRACTION\
|
||||
\ or FINAL_ELBO_FAIL_FRACTION and\nNUM_TRAINING_TRIES is > 1.) \n"
|
||||
info: null
|
||||
default:
|
||||
- 0.2
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--posterior_batch_size"
|
||||
description: "Training detail: size of batches when creating the posterior.\n\
|
||||
Reduce this to avoid running out of GPU memory creating the posterior\n(will\
|
||||
\ be slower).\n"
|
||||
info: null
|
||||
default:
|
||||
- 128
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--posterior_regulation"
|
||||
description: "Posterior regularization method. (For experts: not required for\
|
||||
\ normal usage,\nsee documentation). \n\n* PRq is approximate quantile-targeting.\n\
|
||||
* PRmu is approximate mean-targeting aggregated over genes (behavior of v0.2.0).\n\
|
||||
* PRmu_gene is approximate mean-targeting per gene.\n"
|
||||
info: null
|
||||
required: false
|
||||
choices:
|
||||
- "PRq"
|
||||
- "PRmu"
|
||||
- "PRmu_gene"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--alpha"
|
||||
description: "Tunable parameter alpha for the PRq posterior regularization method\n\
|
||||
(not normally used: see documentation).\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--q"
|
||||
description: "Tunable parameter q for the CDF threshold estimation method (not\n\
|
||||
normally used: see documentation).\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--estimator"
|
||||
description: "Output denoised count estimation method. (For experts: not required\n\
|
||||
for normal usage, see documentation).\n"
|
||||
info: null
|
||||
default:
|
||||
- "mckp"
|
||||
required: false
|
||||
choices:
|
||||
- "map"
|
||||
- "mean"
|
||||
- "cdf"
|
||||
- "sample"
|
||||
- "mckp"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "boolean_true"
|
||||
name: "--estimator_multiple_cpu"
|
||||
description: "Including the flag --estimator-multiple-cpu will use more than one\n\
|
||||
CPU to compute the MCKP output count estimator in parallel (does nothing\nfor\
|
||||
\ other estimators).\n"
|
||||
info: null
|
||||
direction: "input"
|
||||
- type: "boolean"
|
||||
name: "--constant_learning_rate"
|
||||
description: "Including the flag --constant-learning-rate will use the ClippedAdam\n\
|
||||
optimizer instead of the OneCycleLR learning rate schedule, which is\nthe default.\
|
||||
\ Learning is faster with the OneCycleLR schedule.\nHowever, training can easily\
|
||||
\ be continued from a checkpoint for more\nepochs than the initial command specified\
|
||||
\ when using ClippedAdam. On\nthe other hand, if using the OneCycleLR schedule\
|
||||
\ with 150 epochs\nspecified, it is not possible to pick up from that final\
|
||||
\ checkpoint\nand continue training until 250 epochs.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "boolean_true"
|
||||
name: "--debug"
|
||||
description: "Including the flag --debug will log extra messages useful for debugging.\n"
|
||||
info: null
|
||||
direction: "input"
|
||||
- type: "boolean_true"
|
||||
name: "--cuda"
|
||||
description: "Including the flag --cuda will run the inference on a\nGPU.\n"
|
||||
info: null
|
||||
direction: "input"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Eliminating technical artifacts from high-throughput single-cell RNA\
|
||||
\ sequencing data.\n\nThis module removes counts due to ambient RNA molecules and\
|
||||
\ random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the\
|
||||
\ moment, only the count matrices produced by the CellRanger count pipeline are supported.\
|
||||
\ Support for additional tools and protocols \nwill be added in the future. A quick\
|
||||
\ start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "midcpu"
|
||||
- "midmem"
|
||||
- "gpu"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04"
|
||||
target_tag: "2.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "docker"
|
||||
run:
|
||||
- "apt update && DEBIAN_FRONTEND=noninteractive apt install -y make build-essential\
|
||||
\ libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates\
|
||||
\ curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev\
|
||||
\ liblzma-dev mecab-ipadic-utf8 git \\\n&& curl https://pyenv.run | bash \\\n\
|
||||
&& pyenv update \\\n&& pyenv install $PYTHON_VERSION \\\n&& pyenv global $PYTHON_VERSION\
|
||||
\ \\\n&& apt-get clean\n"
|
||||
env:
|
||||
- "PYENV_ROOT=\"/root/.pyenv\""
|
||||
- "PATH=\"$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH\""
|
||||
- "PYTHON_VERSION=3.7.16"
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "lxml~=4.8.0"
|
||||
- "mudata~=0.2.1"
|
||||
- "cellbender~=0.3.0"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/correction/cellbender_remove_background/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/correction/cellbender_remove_background"
|
||||
executable: "target/nextflow/correction/cellbender_remove_background/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "2.1.1-2-ga0c95224865"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"2.1.2\""
|
||||
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,125 @@
|
||||
manifest {
|
||||
name = 'correction/cellbender_remove_background'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = '2.1.2'
|
||||
description = 'Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// Detect the temporary directory: take the first environment variable among
// NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR that is set to a non-empty value
// (Groovy's Elvis operator skips null and empty strings), falling back to
// '/tmp'. The result is stored as an absolute path and is what the
// 'mount_temp' profile assigns to docker.temp / podman.temp / charliecloud.temp.
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Limit a requested memory amount to the optional ceiling `process.maxMemory`.
 *
 * Behaviour:
 *   - If `process.maxMemory` is absent or falsy, the request is returned unchanged.
 *   - If `process.maxRetries` is set and the current `task.attempt` equals it,
 *     the configured ceiling itself is returned.
 *   - If the request is larger than the ceiling, the ceiling is returned.
 *   - Otherwise the request is returned unchanged.
 *
 * Any exception raised while evaluating the ceiling (e.g. a value that cannot
 * be coerced to a MemoryUnit) prints a diagnostic and terminates the JVM with
 * exit code 1.
 *
 * @param to_compare the requested memory; must support `compareTo` against a
 *                   `nextflow.util.MemoryUnit` (the labels in this file pass
 *                   expressions such as `4.GB * task.attempt`)
 * @return the memory amount to use for the task
 */
def get_memory(to_compare) {
  // No ceiling configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; it is expected
    // to resolve from the calling context - confirm.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      // Attempt number matches maxRetries: request the full ceiling.
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Request exceeds the ceiling: clamp to it. The original returned the
      // undefined name `max_memory` here, which always fell through to the
      // catch block and aborted the run instead of clamping.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,51 @@
|
||||
# Inputs
|
||||
input: # please fill in - example: "input.h5mu"
|
||||
modality: "rna"
|
||||
|
||||
# Outputs
|
||||
# output: "$id.$key.output.h5mu"
|
||||
# output_compression: "gzip"
|
||||
layer_output: "cellbender_corrected"
|
||||
obs_background_fraction: "cellbender_background_fraction"
|
||||
obs_cell_probability: "cellbender_cell_probability"
|
||||
obs_cell_size: "cellbender_cell_size"
|
||||
obs_droplet_efficiency: "cellbender_droplet_efficiency"
|
||||
obs_latent_scale: "cellbender_latent_scale"
|
||||
var_ambient_expression: "cellbender_ambient_expression"
|
||||
obsm_gene_expression_encoding: "cellbender_gene_expression_encoding"
|
||||
|
||||
# Arguments
|
||||
expected_cells_from_qc: false
|
||||
# expected_cells: 1000
|
||||
# total_droplets_included: 25000
|
||||
# force_cell_umi_prior: 123
|
||||
# force_empty_umi_prior: 123
|
||||
model: "full"
|
||||
epochs: 150
|
||||
low_count_threshold: 5
|
||||
z_dim: 64
|
||||
z_layers: [512]
|
||||
training_fraction: 0.9
|
||||
empty_drop_training_fraction: 0.2
|
||||
# ignore_features: [123]
|
||||
fpr: [0.01]
|
||||
# exclude_feature_types: ["foo"]
|
||||
projected_ambient_count_threshold: 0.1
|
||||
learning_rate: 1.0E-4
|
||||
# final_elbo_fail_fraction: 123.0
|
||||
# epoch_elbo_fail_fraction: 123.0
|
||||
num_training_tries: 1
|
||||
learning_rate_retry_mult: 0.2
|
||||
posterior_batch_size: 128
|
||||
# posterior_regulation: "foo"
|
||||
# alpha: 123.0
|
||||
# q: 123.0
|
||||
estimator: "mckp"
|
||||
estimator_multiple_cpu: false
|
||||
# constant_learning_rate: true
|
||||
debug: false
|
||||
cuda: false
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
@@ -0,0 +1,551 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema",
|
||||
"title": "cellbender_remove_background",
|
||||
"description": "Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.\n\nThis module removes counts due to ambient RNA molecules and random barcode swapping from (raw) UMI-based scRNA-seq count matrices. \nAt the moment, only the count matrices produced by the CellRanger count pipeline is supported. Support for additional tools and protocols \nwill be added in the future. A quick start tutorial can be found here.\n\nFleming et al. 2022, bioRxiv.\n",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
|
||||
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv",
|
||||
"pattern": "^\\S+\\.csv$"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
|
||||
"inputs" : {
|
||||
"title": "Inputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"input": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, example: `input.h5mu`. Input h5mu file",
|
||||
"help_text": "Type: `file`, required, example: `input.h5mu`. Input h5mu file. Data file on which to run tool. Data must be un-filtered: it should include empty droplets."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"modality": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `rna`. List of modalities to process",
|
||||
"help_text": "Type: `string`, default: `rna`. List of modalities to process."
|
||||
,
|
||||
"default":"rna"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"outputs" : {
|
||||
"title": "Outputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Full count matrix as an h5mu file, with background RNA removed",
|
||||
"help_text": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Full count matrix as an h5mu file, with background RNA removed. This file contains all the original droplet barcodes."
|
||||
,
|
||||
"default":"$id.$key.output.h5mu"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_compression": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. ",
|
||||
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. ",
|
||||
"enum": ["gzip", "lzf"]
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"layer_output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `cellbender_corrected`. Output layer",
|
||||
"help_text": "Type: `string`, default: `cellbender_corrected`. Output layer"
|
||||
,
|
||||
"default":"cellbender_corrected"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_background_fraction": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `cellbender_background_fraction`. ",
|
||||
"help_text": "Type: `string`, default: `cellbender_background_fraction`. "
|
||||
,
|
||||
"default":"cellbender_background_fraction"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_cell_probability": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `cellbender_cell_probability`. ",
|
||||
"help_text": "Type: `string`, default: `cellbender_cell_probability`. "
|
||||
,
|
||||
"default":"cellbender_cell_probability"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_cell_size": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `cellbender_cell_size`. ",
|
||||
"help_text": "Type: `string`, default: `cellbender_cell_size`. "
|
||||
,
|
||||
"default":"cellbender_cell_size"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_droplet_efficiency": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `cellbender_droplet_efficiency`. ",
|
||||
"help_text": "Type: `string`, default: `cellbender_droplet_efficiency`. "
|
||||
,
|
||||
"default":"cellbender_droplet_efficiency"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_latent_scale": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `cellbender_latent_scale`. ",
|
||||
"help_text": "Type: `string`, default: `cellbender_latent_scale`. "
|
||||
,
|
||||
"default":"cellbender_latent_scale"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"var_ambient_expression": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `cellbender_ambient_expression`. ",
|
||||
"help_text": "Type: `string`, default: `cellbender_ambient_expression`. "
|
||||
,
|
||||
"default":"cellbender_ambient_expression"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obsm_gene_expression_encoding": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `cellbender_gene_expression_encoding`. ",
|
||||
"help_text": "Type: `string`, default: `cellbender_gene_expression_encoding`. "
|
||||
,
|
||||
"default":"cellbender_gene_expression_encoding"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"arguments" : {
|
||||
"title": "Arguments",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"expected_cells_from_qc": {
|
||||
"type":
|
||||
"boolean",
|
||||
"description": "Type: `boolean`, default: `false`. Will use the Cell Ranger QC to determine the estimated number of cells",
|
||||
"help_text": "Type: `boolean`, default: `false`. Will use the Cell Ranger QC to determine the estimated number of cells"
|
||||
,
|
||||
"default":false
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"expected_cells": {
|
||||
"type":
|
||||
"integer",
|
||||
"description": "Type: `integer`, example: `1000`. Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient)",
|
||||
"help_text": "Type: `integer`, example: `1000`. Number of cells expected in the dataset (a rough estimate within a factor of 2 is sufficient)."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"total_droplets_included": {
|
||||
"type":
|
||||
"integer",
|
||||
"description": "Type: `integer`, example: `25000`. The number of droplets from the rank-ordered UMI plot\nthat will have their cell probabilities inferred as an\noutput",
|
||||
"help_text": "Type: `integer`, example: `25000`. The number of droplets from the rank-ordered UMI plot\nthat will have their cell probabilities inferred as an\noutput. Include the droplets which might contain cells.\nDroplets beyond TOTAL_DROPLETS_INCLUDED should be\n\u0027surely empty\u0027 droplets.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"force_cell_umi_prior": {
|
||||
"type":
|
||||
"integer",
|
||||
"description": "Type: `integer`. Ignore CellBender\u0027s heuristic prior estimation, and use this prior for UMI counts in cells",
|
||||
"help_text": "Type: `integer`. Ignore CellBender\u0027s heuristic prior estimation, and use this prior for UMI counts in cells."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"force_empty_umi_prior": {
|
||||
"type":
|
||||
"integer",
|
||||
"description": "Type: `integer`. Ignore CellBender\u0027s heuristic prior estimation, and use this prior for UMI counts in empty droplets",
|
||||
"help_text": "Type: `integer`. Ignore CellBender\u0027s heuristic prior estimation, and use this prior for UMI counts in empty droplets."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"model": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `full`, choices: ``naive`, `simple`, `ambient`, `swapping`, `full``. Which model is being used for count data",
|
||||
"help_text": "Type: `string`, default: `full`, choices: ``naive`, `simple`, `ambient`, `swapping`, `full``. Which model is being used for count data.\n\n* \u0027naive\u0027 subtracts the estimated ambient profile.\n* \u0027simple\u0027 does not model either ambient RNA or random barcode swapping (for debugging purposes -- not recommended).\n* \u0027ambient\u0027 assumes background RNA is incorporated into droplets.\n* \u0027swapping\u0027 assumes background RNA comes from random barcode swapping (via PCR chimeras).\n* \u0027full\u0027 uses a combined ambient and swapping model.\n",
|
||||
"enum": ["naive", "simple", "ambient", "swapping", "full"]
|
||||
|
||||
,
|
||||
"default":"full"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"epochs": {
|
||||
"type":
|
||||
"integer",
|
||||
"description": "Type: `integer`, default: `150`. Number of epochs to train",
|
||||
"help_text": "Type: `integer`, default: `150`. Number of epochs to train."
|
||||
,
|
||||
"default":150
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"low_count_threshold": {
|
||||
"type":
|
||||
"integer",
|
||||
"description": "Type: `integer`, default: `5`. Droplets with UMI counts below this number are completely \nexcluded from the analysis",
|
||||
"help_text": "Type: `integer`, default: `5`. Droplets with UMI counts below this number are completely \nexcluded from the analysis. This can help identify the correct \nprior for empty droplet counts in the rare case where empty \ncounts are extremely high (over 200).\n"
|
||||
,
|
||||
"default":5
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"z_dim": {
|
||||
"type":
|
||||
"integer",
|
||||
"description": "Type: `integer`, default: `64`. Dimension of latent variable z",
|
||||
"help_text": "Type: `integer`, default: `64`. Dimension of latent variable z.\n"
|
||||
,
|
||||
"default":64
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"z_layers": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `integer`, default: `512`, multiple_sep: `\";\"`. Dimension of hidden layers in the encoder for z",
|
||||
"help_text": "Type: List of `integer`, default: `512`, multiple_sep: `\";\"`. Dimension of hidden layers in the encoder for z.\n"
|
||||
,
|
||||
"default":"512"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"training_fraction": {
|
||||
"type":
|
||||
"number",
|
||||
"description": "Type: `double`, default: `0.9`. Training detail: the fraction of the data used for training",
|
||||
"help_text": "Type: `double`, default: `0.9`. Training detail: the fraction of the data used for training.\nThe rest is never seen by the inference algorithm. Speeds up learning.\n"
|
||||
,
|
||||
"default":0.9
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"empty_drop_training_fraction": {
|
||||
"type":
|
||||
"number",
|
||||
"description": "Type: `double`, default: `0.2`. Training detail: the fraction of the training data each epoch that \nis drawn (randomly sampled) from surely empty droplets",
|
||||
"help_text": "Type: `double`, default: `0.2`. Training detail: the fraction of the training data each epoch that \nis drawn (randomly sampled) from surely empty droplets.\n"
|
||||
,
|
||||
"default":0.2
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"ignore_features": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `integer`, multiple_sep: `\";\"`. Integer indices of features to ignore entirely",
|
||||
"help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Integer indices of features to ignore entirely. In the output\ncount matrix, the counts for these features will be unchanged.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"fpr": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `double`, default: `0.01`, multiple_sep: `\";\"`. Target \u0027delta\u0027 false positive rate in [0, 1)",
|
||||
"help_text": "Type: List of `double`, default: `0.01`, multiple_sep: `\";\"`. Target \u0027delta\u0027 false positive rate in [0, 1). Use 0 for a cohort\nof samples which will be jointly analyzed for differential expression.\nA false positive is a true signal count that is erroneously removed.\nMore background removal is accompanied by more signal removal at\nhigh values of FPR. You can specify multiple values, which will\ncreate multiple output files.\n"
|
||||
,
|
||||
"default":"0.01"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"exclude_feature_types": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `string`, multiple_sep: `\";\"`. Feature types to ignore during the analysis",
|
||||
"help_text": "Type: List of `string`, multiple_sep: `\";\"`. Feature types to ignore during the analysis. These features will\nbe left unchanged in the output file.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"projected_ambient_count_threshold": {
|
||||
"type":
|
||||
"number",
|
||||
"description": "Type: `double`, default: `0.1`. Controls how many features are included in the analysis, which\ncan lead to a large speedup",
|
||||
"help_text": "Type: `double`, default: `0.1`. Controls how many features are included in the analysis, which\ncan lead to a large speedup. If a feature is expected to have less\nthan PROJECTED_AMBIENT_COUNT_THRESHOLD counts total in all cells\n(summed), then that gene is excluded, and it will be unchanged\nin the output count matrix. For example, \nPROJECTED_AMBIENT_COUNT_THRESHOLD = 0 will include all features\nwhich have even a single count in any empty droplet.\n"
|
||||
,
|
||||
"default":0.1
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"learning_rate": {
|
||||
"type":
|
||||
"number",
|
||||
"description": "Type: `double`, default: `1.0E-4`. Training detail: lower learning rate for inference",
|
||||
"help_text": "Type: `double`, default: `1.0E-4`. Training detail: lower learning rate for inference.\nA OneCycle learning rate schedule is used, where the\nupper learning rate is ten times this value. (For this\nvalue, probably do not exceed 1e-3).\n"
|
||||
,
|
||||
"default":0.0001
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"final_elbo_fail_fraction": {
|
||||
"type":
|
||||
"number",
|
||||
"description": "Type: `double`. Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO - initial_test_ELBO) \u003e FINAL_ELBO_FAIL_FRACTION",
|
||||
"help_text": "Type: `double`. Training is considered to have failed if \n(best_test_ELBO - final_test_ELBO)/(best_test_ELBO - initial_test_ELBO) \u003e FINAL_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries \u003e 1.\nBy default, will not fail training based on final_training_ELBO.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"epoch_elbo_fail_fraction": {
|
||||
"type":
|
||||
"number",
|
||||
"description": "Type: `double`. Training is considered to have failed if \n(previous_epoch_test_ELBO - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO) \u003e EPOCH_ELBO_FAIL_FRACTION",
|
||||
"help_text": "Type: `double`. Training is considered to have failed if \n(previous_epoch_test_ELBO - current_epoch_test_ELBO)/(previous_epoch_test_ELBO - initial_train_ELBO) \u003e EPOCH_ELBO_FAIL_FRACTION.\nTraining will automatically re-run if --num-training-tries \u003e 1.\nBy default, will not fail training based on epoch_training_ELBO.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"num_training_tries": {
|
||||
"type":
|
||||
"integer",
|
||||
"description": "Type: `integer`, default: `1`. Number of times to attempt to train the model",
|
||||
"help_text": "Type: `integer`, default: `1`. Number of times to attempt to train the model. At each subsequent attempt,\nthe learning rate is multiplied by LEARNING_RATE_RETRY_MULT.\n"
|
||||
,
|
||||
"default":1
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"learning_rate_retry_mult": {
|
||||
"type":
|
||||
"number",
|
||||
"description": "Type: `double`, default: `0.2`. Learning rate is multiplied by this amount each time a new training\nattempt is made",
|
||||
"help_text": "Type: `double`, default: `0.2`. Learning rate is multiplied by this amount each time a new training\nattempt is made. (This parameter is only used if training fails based\non EPOCH_ELBO_FAIL_FRACTION or FINAL_ELBO_FAIL_FRACTION and\nNUM_TRAINING_TRIES is \u003e 1.) \n"
|
||||
,
|
||||
"default":0.2
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"posterior_batch_size": {
|
||||
"type":
|
||||
"integer",
|
||||
"description": "Type: `integer`, default: `128`. Training detail: size of batches when creating the posterior",
|
||||
"help_text": "Type: `integer`, default: `128`. Training detail: size of batches when creating the posterior.\nReduce this to avoid running out of GPU memory creating the posterior\n(will be slower).\n"
|
||||
,
|
||||
"default":128
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"posterior_regulation": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, choices: ``PRq`, `PRmu`, `PRmu_gene``. Posterior regularization method",
|
||||
"help_text": "Type: `string`, choices: ``PRq`, `PRmu`, `PRmu_gene``. Posterior regularization method. (For experts: not required for normal usage,\nsee documentation). \n\n* PRq is approximate quantile-targeting.\n* PRmu is approximate mean-targeting aggregated over genes (behavior of v0.2.0).\n* PRmu_gene is approximate mean-targeting per gene.\n",
|
||||
"enum": ["PRq", "PRmu", "PRmu_gene"]
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"alpha": {
|
||||
"type":
|
||||
"number",
|
||||
"description": "Type: `double`. Tunable parameter alpha for the PRq posterior regularization method\n(not normally used: see documentation)",
|
||||
"help_text": "Type: `double`. Tunable parameter alpha for the PRq posterior regularization method\n(not normally used: see documentation).\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"q": {
|
||||
"type":
|
||||
"number",
|
||||
"description": "Type: `double`. Tunable parameter q for the CDF threshold estimation method (not\nnormally used: see documentation)",
|
||||
"help_text": "Type: `double`. Tunable parameter q for the CDF threshold estimation method (not\nnormally used: see documentation).\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"estimator": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `mckp`, choices: ``map`, `mean`, `cdf`, `sample`, `mckp``. Output denoised count estimation method",
|
||||
"help_text": "Type: `string`, default: `mckp`, choices: ``map`, `mean`, `cdf`, `sample`, `mckp``. Output denoised count estimation method. (For experts: not required\nfor normal usage, see documentation).\n",
|
||||
"enum": ["map", "mean", "cdf", "sample", "mckp"]
|
||||
|
||||
,
|
||||
"default":"mckp"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"estimator_multiple_cpu": {
|
||||
"type":
|
||||
"boolean",
|
||||
"description": "Type: `boolean_true`, default: `false`. Including the flag --estimator-multiple-cpu will use more than one\nCPU to compute the MCKP output count estimator in parallel (does nothing\nfor other estimators)",
|
||||
"help_text": "Type: `boolean_true`, default: `false`. Including the flag --estimator-multiple-cpu will use more than one\nCPU to compute the MCKP output count estimator in parallel (does nothing\nfor other estimators).\n"
|
||||
,
|
||||
"default":false
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"constant_learning_rate": {
|
||||
"type":
|
||||
"boolean",
|
||||
"description": "Type: `boolean`. Including the flag --constant-learning-rate will use the ClippedAdam\noptimizer instead of the OneCycleLR learning rate schedule, which is\nthe default",
|
||||
"help_text": "Type: `boolean`. Including the flag --constant-learning-rate will use the ClippedAdam\noptimizer instead of the OneCycleLR learning rate schedule, which is\nthe default. Learning is faster with the OneCycleLR schedule.\nHowever, training can easily be continued from a checkpoint for more\nepochs than the initial command specified when using ClippedAdam. On\nthe other hand, if using the OneCycleLR schedule with 150 epochs\nspecified, it is not possible to pick up from that final checkpoint\nand continue training until 250 epochs.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"debug": {
|
||||
"type":
|
||||
"boolean",
|
||||
"description": "Type: `boolean_true`, default: `false`. Including the flag --debug will log extra messages useful for debugging",
|
||||
"help_text": "Type: `boolean_true`, default: `false`. Including the flag --debug will log extra messages useful for debugging.\n"
|
||||
,
|
||||
"default":false
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"cuda": {
|
||||
"type":
|
||||
"boolean",
|
||||
"description": "Type: `boolean_true`, default: `false`. Including the flag --cuda will run the inference on a\nGPU",
|
||||
"help_text": "Type: `boolean_true`, default: `false`. Including the flag --cuda will run the inference on a\nGPU.\n"
|
||||
,
|
||||
"default":false
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"nextflow input-output arguments" : {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"publish_dir": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
|
||||
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/inputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/outputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/arguments"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to emit INFO-level records on stdout.

    A stream handler bound to the current ``sys.stdout`` is attached to the
    root logger using the format ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The call is idempotent: when the root logger already carries a stream
    handler writing to that same stdout object, no additional handler is
    added, so calling this function repeatedly does not duplicate log lines.

    Returns:
        logging.Logger: The root logger, with its level set to ``INFO``.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Attaching a fresh handler on every call would print each record once
    # per call, so only attach one when stdout is not being served yet.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        )
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,262 @@
|
||||
name: "add_id"
|
||||
namespace: "metadata"
|
||||
version: "2.1.2"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Arguments"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Path to the input .h5mu."
|
||||
info: null
|
||||
example:
|
||||
- "sample_path"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--input_id"
|
||||
description: "The input id."
|
||||
info: null
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_output"
|
||||
description: "Name of the .obs column where to store the id."
|
||||
info: null
|
||||
default:
|
||||
- "sample_id"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: false
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "The compression format to be used on the output h5mu object."
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "boolean_true"
|
||||
name: "--make_observation_keys_unique"
|
||||
description: "Join the id to the .obs index (.obs_names)."
|
||||
info: null
|
||||
direction: "input"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Add id of .obs. Also allows to make .obs_names (the .obs index) unique\
|
||||
\ \nby prefixing the values with an unique id per .h5mu file.\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "singlecpu"
|
||||
- "lowmem"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.11-slim"
|
||||
target_tag: "2.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/metadata/add_id/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/metadata/add_id"
|
||||
executable: "target/nextflow/metadata/add_id/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "2.1.1-2-ga0c95224865"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"2.1.2\""
|
||||
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'metadata/add_id'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = '2.1.2'
|
||||
description = 'Add id of .obs. Also allows to make .obs_names (the .obs index) unique \nby prefixing the values with an unique id per .h5mu file.\n'
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Cap a requested memory amount at `process.maxMemory`, when such a limit is configured.
 *
 * @param to_compare the requested amount of memory; it must support
 *                   `compareTo(nextflow.util.MemoryUnit)`.
 * @return `to_compare` when no limit is configured or the request does not exceed it;
 *         otherwise the configured `process.maxMemory`.
 *
 * Any error raised while evaluating the limit is reported on stdout and terminates
 * the JVM with exit status 1.
 */
def get_memory(to_compare) {
  // No limit configured (key absent, null or otherwise falsy): pass the request through.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // When the attempt number equals maxRetries, ask for the configured maximum outright.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo(...) == 1 is treated as "the request exceeds the limit": clamp to the limit.
    // This must reference process.maxMemory; a bare, undefined variable here would throw
    // and turn every over-limit request into a hard exit through the catch block below.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,11 @@
|
||||
# Arguments
|
||||
input: # please fill in - example: "sample_path"
|
||||
input_id: # please fill in - example: "foo"
|
||||
obs_output: "sample_id"
|
||||
# output: "$id.$key.output.h5mu"
|
||||
# output_compression: "gzip"
|
||||
make_observation_keys_unique: false
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
@@ -0,0 +1,132 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema",
|
||||
"title": "add_id",
|
||||
"description": "Add id of .obs. Also allows to make .obs_names (the .obs index) unique \nby prefixing the values with a unique id per .h5mu file.\n",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
|
||||
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv",
|
||||
"pattern": "^\\S+\\.csv$"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
|
||||
"arguments" : {
|
||||
"title": "Arguments",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"input": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu",
|
||||
"help_text": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"input_id": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required. The input id",
|
||||
"help_text": "Type: `string`, required. The input id."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `sample_id`. Name of the .obs column where to store the id",
|
||||
"help_text": "Type: `string`, default: `sample_id`. Name of the .obs column where to store the id."
|
||||
,
|
||||
"default":"sample_id"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. ",
|
||||
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. "
|
||||
,
|
||||
"default":"$id.$key.output.h5mu"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_compression": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `gzip`, choices: `gzip`, `lzf`. The compression format to be used on the output h5mu object",
|
||||
"help_text": "Type: `string`, example: `gzip`, choices: `gzip`, `lzf`. The compression format to be used on the output h5mu object.",
|
||||
"enum": ["gzip", "lzf"]
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"make_observation_keys_unique": {
|
||||
"type":
|
||||
"boolean",
|
||||
"description": "Type: `boolean_true`, default: `false`. Join the id to the .obs index (.obs_names)",
|
||||
"help_text": "Type: `boolean_true`, default: `false`. Join the id to the .obs index (.obs_names)."
|
||||
,
|
||||
"default":false
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"nextflow input-output arguments" : {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"publish_dir": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
|
||||
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/arguments"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to emit INFO-level records on standard output.

    Every call attaches a new ``StreamHandler`` bound to the current
    ``sys.stdout``; records are rendered as timestamp, level name padded to
    eight characters, then the message.

    Returns:
        The root ``logging.Logger`` instance.
    """
    import logging
    import sys

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    )
    root_logger.addHandler(stdout_handler)

    return root_logger
|
||||
@@ -0,0 +1,318 @@
|
||||
name: "grep_annotation_column"
|
||||
namespace: "metadata"
|
||||
version: "2.1.2"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
description: "Arguments related to the input dataset."
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Path to the input .h5mu."
|
||||
info: null
|
||||
example:
|
||||
- "sample_path"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--input_column"
|
||||
description: "Column to query. If not specified, use .var_names or .obs_names,\
|
||||
\ depending on the value of --matrix"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--input_layer"
|
||||
description: "Input data to use when calculating fraction of observations that\
|
||||
\ match with the query. \nOnly used when --output_fraction_column is provided.\
|
||||
\ If not specified, .X is used.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "Which modality to get the annotation matrix from.\n"
|
||||
info: null
|
||||
example:
|
||||
- "rna"
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--matrix"
|
||||
description: "Matrix to fetch the column from that will be searched."
|
||||
info: null
|
||||
example:
|
||||
- "var"
|
||||
required: false
|
||||
choices:
|
||||
- "var"
|
||||
- "obs"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
description: "Arguments related to how the output will be written."
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: false
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "The compression format to be used on the output h5mu object."
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_match_column"
|
||||
description: "Name of the column to write the result to."
|
||||
info: null
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_fraction_column"
|
||||
description: "For the opposite axis, name of the column to write the fraction\
|
||||
\ of \nobservations that matches to the pattern.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Query options"
|
||||
description: "Options related to the query"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--regex_pattern"
|
||||
description: "Regex to use to match with the input column."
|
||||
info: null
|
||||
example:
|
||||
- "^[mM][tT]-"
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "compress_h5mu.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Perform a regex lookup on a column from the annotation matrices .obs\
|
||||
\ or .var.\nThe annotation matrix can originate from either a modality, or all modalities\
|
||||
\ (global .var or .obs).\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "singlecpu"
|
||||
- "lowmem"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.11-slim"
|
||||
target_tag: "2.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/metadata/grep_annotation_column/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/metadata/grep_annotation_column"
|
||||
executable: "target/nextflow/metadata/grep_annotation_column/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "2.1.1-2-ga0c95224865"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"2.1.2\""
|
||||
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
@@ -0,0 +1,87 @@
|
||||
import shutil
|
||||
from anndata import AnnData
|
||||
from mudata import write_h5ad
|
||||
from h5py import File as H5File
|
||||
from h5py import Group, Dataset
|
||||
from pathlib import Path
|
||||
from typing import Union, Literal
|
||||
from functools import partial
|
||||
|
||||
|
||||
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a copy of the HDF5 file at ``input_path`` to ``output_path`` with compressed datasets.

    The input is walked with ``visititems``. Groups are recreated with their
    attributes. Datasets that are not yet compressed and are non-scalar are
    re-created with ``compression``; all other datasets are copied unchanged.
    Finally the header text at the start of the input file is replicated into
    the 512-byte user block of the output file.

    Args:
        input_path: Path of the file to read.
        output_path: Path of the file to create (opened in ``"w"`` mode).
        compression: Compression filter passed to ``create_dataset``.

    Raises:
        NotImplementedError: If a visited item is neither a ``Group`` nor a
            ``Dataset``.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the user block reserved at the start of the output file; the
    # header text is padded with null bytes up to this length.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        # Mirror every HDF5 attribute from the source node onto the new node.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        # Invoked positionally by visititems() as (name, node); the first two
        # arguments are pre-bound with functools.partial below.
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects dont have a shape defined
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole user block so a long header is not cut short.
        starting_metadata = input_bytes.read(userblock_size)

    # The metadata is padded with extra null bytes up until 512 bytes.
    # find() returns -1 when there is no padding at all; slicing with -1
    # would drop the last header byte, so only truncate on a real match.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]

    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
|
||||
|
||||
|
||||
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file, write one modality into the copy and optionally compress it.

    Without ``output_compression`` the copy is made directly at
    ``output_file`` and the modality is written into it. With
    ``output_compression`` the copy is first made next to ``output_file``
    under the name ``<stem>_uncompressed.h5mu``, the modality is written into
    that file, it is compressed into ``output_file`` with ``compress_h5mu``,
    and the intermediate file is then removed.

    Args:
        output_file: Destination path of the resulting file.
        h5mu: Path of the existing file used as the starting point.
        modality_name: Name of the modality, passed as ``mod`` to ``write_h5ad``.
        modality_data: Modality contents, passed as ``data`` to ``write_h5ad``.
        output_compression: Compression passed on to ``compress_h5mu``; a
            falsy value disables the compression step.
    """
    destination = Path(output_file)
    template = Path(h5mu)

    # No compression requested: work directly on the final destination.
    if not output_compression:
        shutil.copyfile(template, destination)
        write_h5ad(filename=destination, mod=modality_name, data=modality_data)
        return

    # Compression requested: stage an uncompressed file beside the destination.
    staging = destination.with_name(destination.stem + "_uncompressed.h5mu")
    shutil.copyfile(template, staging)
    write_h5ad(filename=staging, mod=modality_name, data=modality_data)
    compress_h5mu(staging, destination, compression=output_compression)
    staging.unlink()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'metadata/grep_annotation_column'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = '2.1.2'
|
||||
description = 'Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n'
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
// Cap a requested memory amount at the optional `process.maxMemory` setting.
//
// to_compare: the memory amount requested by a label (the labels in this
//             config pass e.g. `4.GB * task.attempt`).
//
// Returns:
//   - `to_compare` unchanged when `process.maxMemory` is absent or unset;
//   - `process.maxMemory` when the current attempt equals `process.maxRetries`;
//   - `process.maxMemory` (as a MemoryUnit) when the request exceeds it;
//   - `to_compare` otherwise.
//
// Any exception raised while evaluating the cap prints a diagnostic and
// terminates the JVM with exit status 1.
//
// NOTE(review): `task` is not a parameter of this function; it is presumably
// resolved from the context in which the config closure runs - verify.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, go straight to the cap.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only promises a positive value for "greater than", so test
    // `> 0` rather than `== 1`.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined name `max_memory`, which threw and
      // ended up in the catch block below, aborting the whole run.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,21 @@
|
||||
# Inputs
|
||||
input: # please fill in - example: "sample_path"
|
||||
# input_column: "foo"
|
||||
# input_layer: "foo"
|
||||
modality: # please fill in - example: "rna"
|
||||
# matrix: "var"
|
||||
|
||||
# Outputs
|
||||
# output: "$id.$key.output.h5mu"
|
||||
# output_compression: "gzip"
|
||||
output_match_column: # please fill in - example: "foo"
|
||||
# output_fraction_column: "foo"
|
||||
|
||||
# Query options
|
||||
regex_pattern: # please fill in - example: "^[mM][tT]-"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
|
||||
# Arguments
|
||||
@@ -0,0 +1,200 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema",
|
||||
"title": "grep_annotation_column",
|
||||
"description": "Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
|
||||
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv",
|
||||
"pattern": "^\\S+\\.csv$"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
|
||||
"inputs" : {
|
||||
"title": "Inputs",
|
||||
"type": "object",
|
||||
"description": "Arguments related to the input dataset.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"input": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu",
|
||||
"help_text": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"input_column": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. Column to query",
|
||||
"help_text": "Type: `string`. Column to query. If not specified, use .var_names or .obs_names, depending on the value of --matrix"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"input_layer": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. Input data to use when calculating fraction of observations that match with the query",
|
||||
"help_text": "Type: `string`. Input data to use when calculating fraction of observations that match with the query. \nOnly used when --output_fraction_column is provided. If not specified, .X is used.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"modality": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `rna`. Which modality to get the annotation matrix from",
|
||||
"help_text": "Type: `string`, required, example: `rna`. Which modality to get the annotation matrix from.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"matrix": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `var`, choices: ``var`, `obs``. Matrix to fetch the column from that will be searched",
|
||||
"help_text": "Type: `string`, example: `var`, choices: ``var`, `obs``. Matrix to fetch the column from that will be searched.",
|
||||
"enum": ["var", "obs"]
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"outputs" : {
|
||||
"title": "Outputs",
|
||||
"type": "object",
|
||||
"description": "Arguments related to how the output will be written.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. ",
|
||||
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. "
|
||||
,
|
||||
"default":"$id.$key.output.h5mu"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_compression": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object",
|
||||
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object.",
|
||||
"enum": ["gzip", "lzf"]
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_match_column": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required. Name of the column to write the result to",
|
||||
"help_text": "Type: `string`, required. Name of the column to write the result to."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_fraction_column": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern",
|
||||
"help_text": "Type: `string`. For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"query options" : {
|
||||
"title": "Query options",
|
||||
"type": "object",
|
||||
"description": "Options related to the query",
|
||||
"properties": {
|
||||
|
||||
|
||||
"regex_pattern": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `^[mM][tT]-`. Regex to use to match with the input column",
|
||||
"help_text": "Type: `string`, required, example: `^[mM][tT]-`. Regex to use to match with the input column."
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"nextflow input-output arguments" : {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"publish_dir": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
|
||||
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/inputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/outputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/query options"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    The root logger's level is set to ``logging.INFO`` and a stream handler
    bound to the current ``sys.stdout`` is attached, formatting records as
    ``"<asctime> <levelname padded to 8> <message>"``.

    The function is idempotent: calling it again while its handler is already
    attached to the same stdout stream does not attach a second handler, so
    log lines are not duplicated.

    Returns:
        The root ``logging.Logger``.
    """
    import logging
    from sys import stdout

    handler_name = "setup_logger_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only reuse a handler that this function created (matched by name) and
    # that still targets the current stdout; if stdout was swapped in the
    # meantime a fresh handler is attached, as before.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and handler.get_name() == handler_name
        and handler.stream is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.set_name(handler_name)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,376 @@
|
||||
name: "calculate_qc_metrics"
|
||||
namespace: "qc"
|
||||
version: "2.1.2"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
description: "Input h5mu file"
|
||||
info: null
|
||||
example:
|
||||
- "input.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--layer"
|
||||
info: null
|
||||
example:
|
||||
- "raw_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Metrics added to .obs"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--var_qc_metrics"
|
||||
description: "Keys to select a boolean (containing only True or False) column\
|
||||
\ from .var.\nFor each cell, calculate the proportion of total values for genes\
|
||||
\ which are labeled 'True', \ncompared to the total sum of the values for all\
|
||||
\ genes.\n"
|
||||
info: null
|
||||
example:
|
||||
- "ercc,highly_variable,mitochondrial"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "boolean"
|
||||
name: "--var_qc_metrics_fill_na_value"
|
||||
description: "Fill any 'NA' values found in the columns specified with --var_qc_metrics\
|
||||
\ to 'True' or 'False'.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--top_n_vars"
|
||||
description: "Number of top vars to be used to calculate cumulative proportions.\n\
|
||||
If not specified, proportions are not calculated. `--top_n_vars 20;50` finds\n\
|
||||
cumulative proportion to the 20th and 50th most expressed vars.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_obs_num_nonzero_vars"
|
||||
description: "Name of column in .obs describing, for each observation, the number\
|
||||
\ of stored values\n(including explicit zeroes). In other words, the name of\
|
||||
\ the column that counts\nfor each row the number of columns that contain data.\n"
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_vars"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_obs_total_counts_vars"
|
||||
description: "Name of the column for .obs describing, for each observation (row),\n\
|
||||
the sum of the stored values in the columns.\n"
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Metrics added to .var"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--output_var_num_nonzero_obs"
|
||||
description: "Name of column describing, for each feature, the number of stored\
|
||||
\ values\n(including explicit zeroes). In other words, the name of the column\
|
||||
\ that counts\nfor each column the number of rows that contain data.\n"
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_obs"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_total_counts_obs"
|
||||
description: "Name of the column in .var describing, for each feature (column),\n\
|
||||
the sum of the stored values in the rows.\n"
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_obs_mean"
|
||||
description: "Name of the column in .var providing, for each feature (column),\n\
|
||||
the mean of the stored values in the rows.\n"
|
||||
info: null
|
||||
default:
|
||||
- "obs_mean"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_pct_dropout"
|
||||
description: "Name of the column in .var providing for each feature the percentage\
|
||||
\ of\nobservations the feature does not appear on (i.e. is missing). Same as\
|
||||
\ `--output_var_num_nonzero_obs`\nbut percentage based.\n"
|
||||
info: null
|
||||
default:
|
||||
- "pct_dropout"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
description: "Output h5mu file."
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: false
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "The compression format to be used on the output h5mu object."
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "compress_h5mu.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are\
|
||||
\ comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have\
|
||||
\ slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n\
|
||||
\ - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n\
|
||||
\ - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs\
|
||||
\ metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics}\
|
||||
\ -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n\
|
||||
\ - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n\
|
||||
\ - total_counts -> total_{expr_type}\n \n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "singlecpu"
|
||||
- "midmem"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.11-slim"
|
||||
target_tag: "2.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
- "scipy"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "scanpy"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/qc/calculate_qc_metrics/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/qc/calculate_qc_metrics"
|
||||
executable: "target/nextflow/qc/calculate_qc_metrics/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "2.1.1-2-ga0c95224865"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"2.1.2\""
|
||||
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
@@ -0,0 +1,87 @@
|
||||
import shutil
|
||||
from anndata import AnnData
|
||||
from mudata import write_h5ad
|
||||
from h5py import File as H5File
|
||||
from h5py import Group, Dataset
|
||||
from pathlib import Path
|
||||
from typing import Union, Literal
|
||||
from functools import partial
|
||||
|
||||
|
||||
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a copy of an .h5mu file with its datasets compressed.

    Every group and dataset of the input is recreated in the output file.
    Non-scalar datasets that are not compressed yet are rewritten using
    ``compression``; all other datasets are copied as they are. Attributes of
    the file, of each group and of each rewritten dataset are carried over.
    Finally, the text at the start of the input file is copied into the
    512-byte user block reserved at the start of the output file.

    Args:
        input_path: Path of the file to read.
        output_path: Path of the file to create (opened in ``"w"`` mode).
        compression: Compression filter passed to ``create_dataset``.

    Raises:
        NotImplementedError: If the input contains an element that is neither
            a ``Group`` nor a ``Dataset``.
    """
    # Size of the user block reserved in the output file.
    userblock_size = 512
    input_path, output_path = str(input_path), str(output_path)

    def copy_attributes(in_object, out_object):
        # Mirror every HDF5 attribute of in_object onto out_object.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        h5_object: Union[Group, Dataset],
    ):
        # Callback for visititems: recreate one element of the input file.
        if isinstance(h5_object, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(h5_object, new_group)
        elif isinstance(h5_object, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects dont have a shape defined
            if not h5_object.compression and h5_object.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=h5_object, compression=compression
                )
                copy_attributes(h5_object, new_dataset)
            else:
                output_h5.copy(h5_object, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(h5_object)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole block: a fixed 100-byte read would cut off a longer
        # header.
        header_block = input_bytes.read(userblock_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # partition() keeps the full buffer when no null byte is present, whereas
    # slicing with the -1 returned by find() would drop the last byte.
    starting_metadata = header_block.partition(b"\x00")[0]
    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
|
||||
|
||||
|
||||
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file and overwrite one modality, optionally compressing.

    The file at ``h5mu`` is copied, ``modality_data`` is written into the copy
    under ``modality_name`` with ``write_h5ad``, and, when
    ``output_compression`` is truthy, the result is passed through
    ``compress_h5mu`` to produce ``output_file``.

    Args:
        output_file: Destination path of the final file.
        h5mu: Path of the existing file used as the starting point.
        modality_name: Name of the modality to write.
        modality_data: Data written for that modality.
        output_compression: Compression passed on to ``compress_h5mu``; when
            falsy the copy is written directly to ``output_file``.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)
    # With compression, work on a sibling "<stem>_uncompressed.h5mu" file first.
    output_file_uncompressed = (
        output_file.with_name(output_file.stem + "_uncompressed.h5mu")
        if output_compression
        else output_file
    )
    try:
        shutil.copyfile(h5mu, output_file_uncompressed)
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        if output_compression:
            compress_h5mu(
                output_file_uncompressed, output_file, compression=output_compression
            )
    finally:
        # Remove the intermediate file even when writing or compressing
        # failed, so it is not left behind next to the output. Without
        # compression the "uncompressed" path IS the output and must be kept.
        if output_compression:
            output_file_uncompressed.unlink(missing_ok=True)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'qc/calculate_qc_metrics'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = '2.1.2'
|
||||
description = 'Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -> total_{expr_type}\n \n'
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
// Cap a requested memory amount at the optional `process.maxMemory` setting.
//
// to_compare: the memory amount requested by a label (the labels in this
//             config pass e.g. `4.GB * task.attempt`).
//
// Returns:
//   - `to_compare` unchanged when `process.maxMemory` is absent or unset;
//   - `process.maxMemory` when the current attempt equals `process.maxRetries`;
//   - `process.maxMemory` (as a MemoryUnit) when the request exceeds it;
//   - `to_compare` otherwise.
//
// Any exception raised while evaluating the cap prints a diagnostic and
// terminates the JVM with exit status 1.
//
// NOTE(review): `task` is not a parameter of this function; it is presumably
// resolved from the context in which the config closure runs - verify.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, go straight to the cap.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only promises a positive value for "greater than", so test
    // `> 0` rather than `== 1`.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined name `max_memory`, which threw and
      // ended up in the catch block below, aborting the whole run.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,27 @@
|
||||
# Inputs
|
||||
input: # please fill in - example: "input.h5mu"
|
||||
modality: "rna"
|
||||
# layer: "raw_counts"
|
||||
|
||||
# Metrics added to .obs
|
||||
# var_qc_metrics: ["ercc,highly_variable,mitochondrial"]
|
||||
# var_qc_metrics_fill_na_value: true
|
||||
# top_n_vars: [123]
|
||||
output_obs_num_nonzero_vars: "num_nonzero_vars"
|
||||
output_obs_total_counts_vars: "total_counts"
|
||||
|
||||
# Metrics added to .var
|
||||
output_var_num_nonzero_obs: "num_nonzero_obs"
|
||||
output_var_total_counts_obs: "total_counts"
|
||||
output_var_obs_mean: "obs_mean"
|
||||
output_var_pct_dropout: "pct_dropout"
|
||||
|
||||
# Outputs
|
||||
# output: "$id.$key.output.h5mu"
|
||||
# output_compression: "gzip"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
|
||||
# Arguments
|
||||
@@ -0,0 +1,259 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema",
|
||||
"title": "calculate_qc_metrics",
|
||||
"description": "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -\u003e name in scanpy):\n - pct_dropout -\u003e pct_dropout_by_{expr_type}\n - num_nonzero_obs -\u003e n_cells_by_{expr_type}\n - obs_mean -\u003e mean_{expr_type}\n - total_counts -\u003e total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -\u003e n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -\u003e pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -\u003e total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -\u003e pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -\u003e total_{expr_type}\n \n",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
|
||||
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv",
|
||||
"pattern": "^\\S+\\.csv$"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
|
||||
"inputs" : {
|
||||
"title": "Inputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"input": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, example: `input.h5mu`. Input h5mu file",
|
||||
"help_text": "Type: `file`, required, example: `input.h5mu`. Input h5mu file"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"modality": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `rna`. ",
|
||||
"help_text": "Type: `string`, default: `rna`. "
|
||||
,
|
||||
"default":"rna"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"layer": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `raw_counts`. ",
|
||||
"help_text": "Type: `string`, example: `raw_counts`. "
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"outputs" : {
|
||||
"title": "Outputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Output h5mu file",
|
||||
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Output h5mu file."
|
||||
,
|
||||
"default":"$id.$key.output.h5mu"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_compression": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `gzip`, choices: `gzip`, `lzf`. The compression format to be used on the output h5mu object",
|
||||
"help_text": "Type: `string`, example: `gzip`, choices: `gzip`, `lzf`. The compression format to be used on the output h5mu object.",
|
||||
"enum": ["gzip", "lzf"]
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"metrics added to .obs" : {
|
||||
"title": "Metrics added to .obs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"var_qc_metrics": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `string`, example: `ercc,highly_variable,mitochondrial`, multiple_sep: `\";\"`. Keys to select a boolean (containing only True or False) column from .var",
|
||||
"help_text": "Type: List of `string`, example: `ercc,highly_variable,mitochondrial`, multiple_sep: `\";\"`. Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled \u0027True\u0027, \ncompared to the total sum of the values for all genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"var_qc_metrics_fill_na_value": {
|
||||
"type":
|
||||
"boolean",
|
||||
"description": "Type: `boolean`. Fill any \u0027NA\u0027 values found in the columns specified with --var_qc_metrics to \u0027True\u0027 or \u0027False\u0027",
|
||||
"help_text": "Type: `boolean`. Fill any \u0027NA\u0027 values found in the columns specified with --var_qc_metrics to \u0027True\u0027 or \u0027False\u0027.\nas False.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"top_n_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `integer`, multiple_sep: `\";\"`. Number of top vars to be used to calculate cumulative proportions",
|
||||
"help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20;50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_obs_num_nonzero_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes)",
|
||||
"help_text": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n"
|
||||
,
|
||||
"default":"num_nonzero_vars"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_obs_total_counts_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns",
|
||||
"help_text": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n"
|
||||
,
|
||||
"default":"total_counts"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"metrics added to .var" : {
|
||||
"title": "Metrics added to .var",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"output_var_num_nonzero_obs": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)",
|
||||
"help_text": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n"
|
||||
,
|
||||
"default":"num_nonzero_obs"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_total_counts_obs": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows",
|
||||
"help_text": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n"
|
||||
,
|
||||
"default":"total_counts"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_obs_mean": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `obs_mean`. Name of the column in .var providing, for each feature (column), the mean of the stored values in the rows",
|
||||
"help_text": "Type: `string`, default: `obs_mean`. Name of the column in .var providing, for each feature (column), the mean of the stored values in the rows.\n"
|
||||
,
|
||||
"default":"obs_mean"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_pct_dropout": {
|
||||
"type":
|
||||
"string",
|
||||
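"description": "Type: `string`, default: `pct_dropout`. Name of the column in .var providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing)",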
|
||||
"help_text": "Type: `string`, default: `pct_dropout`. Name of the column in .var providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n"
|
||||
,
|
||||
"default":"pct_dropout"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"nextflow input-output arguments" : {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"publish_dir": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
|
||||
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/inputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/outputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/metrics added to .obs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/metrics added to .var"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to log to stdout at INFO level.

    The root logger's level is set to ``logging.INFO`` and a
    ``StreamHandler`` writing to ``sys.stdout`` is attached, formatted as
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The handler is attached only when the root logger does not already have
    a stream handler bound to the current ``sys.stdout``. Without this
    check every call added one more handler, so calling the function twice
    caused each record to be printed twice.

    Returns:
        logging.Logger: the configured root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Skip attaching when an earlier call (or the caller) already installed a
    # handler on this very stdout object; this avoids duplicated output.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        log_formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(log_formatter)
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,406 @@
|
||||
name: "qc"
|
||||
namespace: "workflows/qc"
|
||||
version: "2.1.2"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "author"
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--id"
|
||||
description: "ID of the sample."
|
||||
info: null
|
||||
example:
|
||||
- "foo"
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Path to the sample."
|
||||
info: null
|
||||
example:
|
||||
- "input.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "Which modality to process."
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--layer"
|
||||
description: "Layer to calculate qc metrics for."
|
||||
info: null
|
||||
example:
|
||||
- "raw_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Mitochondrial & Ribosomal Gene Detection"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--var_gene_names"
|
||||
description: ".var column name to be used to detect mitochondrial/ribosomal genes\
|
||||
\ instead of .var_names (default if not set).\nGene names matching with the\
|
||||
\ regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will\
|
||||
\ be \nidentified as mitochondrial or ribosomal genes, respectively.\n"
|
||||
info: null
|
||||
example:
|
||||
- "gene_symbol"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--var_name_mitochondrial_genes"
|
||||
description: "In which .var slot to store a boolean array corresponding to the mitochondrial\
|
||||
\ genes.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_name_mitochondrial_fraction"
|
||||
description: ".obs slot to store the fraction of reads found to be mitochondrial.\
|
||||
\ Defaults to 'fraction_' suffixed by the value of --var_name_mitochondrial_genes\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--mitochondrial_gene_regex"
|
||||
description: "Regex string that identifies mitochondrial genes from --var_gene_names.\n\
|
||||
By default will detect human and mouse mitochondrial genes from a gene symbol.\n"
|
||||
info: null
|
||||
default:
|
||||
- "^[mM][tT]-"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--var_name_ribosomal_genes"
|
||||
description: "In which .var slot to store a boolean array corresponding to the ribosomal\
|
||||
\ genes.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_name_ribosomal_fraction"
|
||||
description: "When specified, write the fraction of counts originating from ribosomal\
|
||||
\ genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified\
|
||||
\ name.\nRequires --var_name_ribosomal_genes.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--ribosomal_gene_regex"
|
||||
description: "Regex string that identifies ribosomal genes from --var_gene_names.\n\
|
||||
By default will detect human and mouse ribosomal genes from a gene symbol.\n"
|
||||
info: null
|
||||
default:
|
||||
- "^[Mm]?[Rr][Pp][LlSs]"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "QC metrics calculation options"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--var_qc_metrics"
|
||||
description: "Keys to select a boolean (containing only True or False) column\
|
||||
\ from .var.\nFor each cell, calculate the proportion of total values for genes\
|
||||
\ which are labeled 'True', \ncompared to the total sum of the values for all\
|
||||
\ genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n"
|
||||
info: null
|
||||
example:
|
||||
- "ercc,highly_variable"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ","
|
||||
- type: "integer"
|
||||
name: "--top_n_vars"
|
||||
description: "Number of top vars to be used to calculate cumulative proportions.\n\
|
||||
If not specified, proportions are not calculated. `--top_n_vars 20,50` finds\n\
|
||||
cumulative proportion to the 20th and 50th most expressed vars.\n"
|
||||
info: null
|
||||
default:
|
||||
- 50
|
||||
- 100
|
||||
- 200
|
||||
- 500
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ","
|
||||
- type: "string"
|
||||
name: "--output_obs_num_nonzero_vars"
|
||||
description: "Name of column in .obs describing, for each observation, the number\
|
||||
\ of stored values\n(including explicit zeroes). In other words, the name of\
|
||||
\ the column that counts\nfor each row the number of columns that contain data.\n"
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_vars"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_obs_total_counts_vars"
|
||||
description: "Name of the column for .obs describing, for each observation (row),\n\
|
||||
the sum of the stored values in the columns.\n"
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_num_nonzero_obs"
|
||||
description: "Name of column describing, for each feature, the number of stored\
|
||||
\ values\n(including explicit zeroes). In other words, the name of the column\
|
||||
\ that counts\nfor each column the number of rows that contain data.\n"
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_obs"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_total_counts_obs"
|
||||
description: "Name of the column in .var describing, for each feature (column),\n\
|
||||
the sum of the stored values in the rows.\n"
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_obs_mean"
|
||||
description: "Name of the column in .var providing, for each feature (column), the\
\ mean of the stored values in the rows.\n"
|
||||
info: null
|
||||
default:
|
||||
- "obs_mean"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_pct_dropout"
|
||||
description: "Name of the column in .var providing for each feature the percentage\
|
||||
\ of\nobservations the feature does not appear on (i.e. is missing). Same as\
|
||||
\ `--output_var_num_nonzero_obs`\nbut percentage based.\n"
|
||||
info: null
|
||||
default:
|
||||
- "pct_dropout"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
description: "Destination path to the output."
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "nextflow_script"
|
||||
path: "main.nf"
|
||||
is_executable: true
|
||||
entrypoint: "run_wf"
|
||||
- type: "file"
|
||||
path: "utils"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "A pipeline to add basic qc statistics to a MuData "
|
||||
test_resources:
|
||||
- type: "nextflow_script"
|
||||
path: "test.nf"
|
||||
is_executable: true
|
||||
entrypoint: "test_wf"
|
||||
- type: "file"
|
||||
path: "concat_test_data"
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3"
|
||||
info:
|
||||
test_dependencies:
|
||||
- name: "qc_test"
|
||||
namespace: "test_workflows/qc"
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
dependencies:
|
||||
- name: "metadata/grep_annotation_column"
|
||||
repository:
|
||||
type: "local"
|
||||
- name: "qc/calculate_qc_metrics"
|
||||
repository:
|
||||
type: "local"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
build_info:
|
||||
config: "src/workflows/qc/qc/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "native"
|
||||
output: "target/nextflow/workflows/qc/qc"
|
||||
executable: "target/nextflow/workflows/qc/qc/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "2.1.1-2-ga0c95224865"
|
||||
dependencies:
|
||||
- "target/nextflow/metadata/grep_annotation_column"
|
||||
- "target/nextflow/qc/calculate_qc_metrics"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"2.1.2\""
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'workflows/qc/qc'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = '2.1.2'
|
||||
description = 'A pipeline to add basic qc statistics to a MuData '
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Clamp a requested memory amount to the optional `process.maxMemory` cap.
 *
 * Behaviour:
 *   - no `process.maxMemory` configured -> the request is returned unchanged;
 *   - the current attempt equals `process.maxRetries` -> `process.maxMemory`
 *     is returned, so the last retry runs with the full cap;
 *   - the request exceeds the cap -> the cap is returned;
 *   - otherwise the request is returned unchanged.
 *
 * If the configured values cannot be interpreted, an explanatory message is
 * printed and the JVM exits with status 1.
 *
 * @param to_compare requested memory (the labels pass e.g. `4.GB * task.attempt`)
 * @return the amount of memory to request for the task
 */
def get_memory(to_compare) {
  // Without a configured cap there is nothing to clamp against.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; presumably it is
    // resolved from the scope of the calling directive closure -- confirm.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only guarantees the sign of its result, so test `> 0` rather than `== 1`.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Fixed: this branch used to return the undefined name `max_memory`, which
      // raised inside the try block and ended the run through the catch below
      // instead of clamping the request to the cap.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,33 @@
|
||||
# Inputs
|
||||
id: # please fill in - example: "foo"
|
||||
input: # please fill in - example: "input.h5mu"
|
||||
modality: "rna"
|
||||
# layer: "raw_counts"
|
||||
|
||||
# Mitochondrial & Ribosomal Gene Detection
|
||||
# var_gene_names: "gene_symbol"
|
||||
# var_name_mitochondrial_genes: "foo"
|
||||
# obs_name_mitochondrial_fraction: "foo"
|
||||
mitochondrial_gene_regex: "^[mM][tT]-"
|
||||
# var_name_ribosomal_genes: "foo"
|
||||
# obs_name_ribosomal_fraction: "foo"
|
||||
ribosomal_gene_regex: "^[Mm]?[Rr][Pp][LlSs]"
|
||||
|
||||
# QC metrics calculation options
|
||||
# var_qc_metrics: ["ercc,highly_variable"]
|
||||
top_n_vars: [50, 100, 200, 500]
|
||||
output_obs_num_nonzero_vars: "num_nonzero_vars"
|
||||
output_obs_total_counts_vars: "total_counts"
|
||||
output_var_num_nonzero_obs: "num_nonzero_obs"
|
||||
output_var_total_counts_obs: "total_counts"
|
||||
output_var_obs_mean: "obs_mean"
|
||||
output_var_pct_dropout: "pct_dropout"
|
||||
|
||||
# Outputs
|
||||
# output: "$id.$key.output.h5mu"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
|
||||
# Arguments
|
||||
@@ -0,0 +1,320 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema",
|
||||
"title": "qc",
|
||||
"description": "A pipeline to add basic qc statistics to a MuData ",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
|
||||
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv",
|
||||
"pattern": "^\\S+\\.csv$"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
|
||||
"inputs" : {
|
||||
"title": "Inputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"id": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `foo`. ID of the sample",
|
||||
"help_text": "Type: `string`, required, example: `foo`. ID of the sample."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"input": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, example: `input.h5mu`. Path to the sample",
|
||||
"help_text": "Type: `file`, required, example: `input.h5mu`. Path to the sample."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"modality": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `rna`. Which modality to process",
|
||||
"help_text": "Type: `string`, default: `rna`. Which modality to process."
|
||||
,
|
||||
"default":"rna"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"layer": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `raw_counts`. Layer to calculate qc metrics for",
|
||||
"help_text": "Type: `string`, example: `raw_counts`. Layer to calculate qc metrics for."
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"outputs" : {
|
||||
"title": "Outputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Destination path to the output",
|
||||
"help_text": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Destination path to the output."
|
||||
,
|
||||
"default":"$id.$key.output.h5mu"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"mitochondrial & ribosomal gene detection" : {
|
||||
"title": "Mitochondrial & Ribosomal Gene Detection",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"var_gene_names": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `gene_symbol`. ",
|
||||
"help_text": "Type: `string`, example: `gene_symbol`. .var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"var_name_mitochondrial_genes": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. In which ",
|
||||
"help_text": "Type: `string`. In which .var slot to store a boolean array corresponding to the mitochondrial genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_name_mitochondrial_fraction": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. ",
|
||||
"help_text": "Type: `string`. .obs slot to store the fraction of reads found to be mitochondrial. Defaults to \u0027fraction_\u0027 suffixed by the value of --var_name_mitochondrial_genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"mitochondrial_gene_regex": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `^[mM][tT]-`. Regex string that identifies mitochondrial genes from --var_gene_names",
|
||||
"help_text": "Type: `string`, default: `^[mM][tT]-`. Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n"
|
||||
,
|
||||
"default":"^[mM][tT]-"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"var_name_ribosomal_genes": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. In which ",
|
||||
"help_text": "Type: `string`. In which .var slot to store a boolean array corresponding to the ribosomal genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_name_ribosomal_fraction": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an ",
|
||||
"help_text": "Type: `string`. When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"ribosomal_gene_regex": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `^[Mm]?[Rr][Pp][LlSs]`. Regex string that identifies ribosomal genes from --var_gene_names",
|
||||
"help_text": "Type: `string`, default: `^[Mm]?[Rr][Pp][LlSs]`. Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n"
|
||||
,
|
||||
"default":"^[Mm]?[Rr][Pp][LlSs]"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"qc metrics calculation options" : {
|
||||
"title": "QC metrics calculation options",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"var_qc_metrics": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `string`, example: `ercc,highly_variable`, multiple_sep: `\",\"`. Keys to select a boolean (containing only True or False) column from ",
|
||||
"help_text": "Type: List of `string`, example: `ercc,highly_variable`, multiple_sep: `\",\"`. Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled \u0027True\u0027, \ncompared to the total sum of the values for all genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"top_n_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `integer`, default: `50,100,200,500`, multiple_sep: `\",\"`. Number of top vars to be used to calculate cumulative proportions",
|
||||
"help_text": "Type: List of `integer`, default: `50,100,200,500`, multiple_sep: `\",\"`. Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20,50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n"
|
||||
,
|
||||
"default":"50,100,200,500"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_obs_num_nonzero_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `num_nonzero_vars`. Name of column in ",
|
||||
"help_text": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n"
|
||||
,
|
||||
"default":"num_nonzero_vars"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_obs_total_counts_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `total_counts`. Name of the column for ",
|
||||
"help_text": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n"
|
||||
,
|
||||
"default":"total_counts"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_num_nonzero_obs": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)",
|
||||
"help_text": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n"
|
||||
,
|
||||
"default":"num_nonzero_obs"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_total_counts_obs": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `total_counts`. Name of the column in ",
|
||||
"help_text": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n"
|
||||
,
|
||||
"default":"total_counts"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_obs_mean": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `obs_mean`. Name of the column in ",
|
||||
"help_text": "Type: `string`, default: `obs_mean`. Name of the column in .var providing, for each feature, the mean of the values across observations.\n"
|
||||
,
|
||||
"default":"obs_mean"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_pct_dropout": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `pct_dropout`. Name of the column in ",
|
||||
"help_text": "Type: `string`, default: `pct_dropout`. Name of the column in .var providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n"
|
||||
,
|
||||
"default":"pct_dropout"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"nextflow input-output arguments" : {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"publish_dir": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
|
||||
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/inputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/outputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/mitochondrial & ribosomal gene detection"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/qc metrics calculation options"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
process.errorStrategy = 'ignore'
|
||||
@@ -0,0 +1,36 @@
|
||||
profiles {
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
// Cap a requested memory amount at the optional process.maxMemory limit.
//
// to_compare: the requested memory amount; it is compared against
//             process.maxMemory (cast to nextflow.util.MemoryUnit) via compareTo.
// Returns:
//   - to_compare unchanged when process.maxMemory is absent or falsy;
//   - process.maxMemory when task.attempt equals process.maxRetries;
//   - process.maxMemory (as a MemoryUnit) when to_compare exceeds it;
//   - to_compare otherwise.
// If any comparison or cast throws, an error message is printed and the JVM
// exits with status 1.
def get_memory(to_compare) {
  // No limit configured: pass the request through untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, hand out the full limit.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only guarantees the sign of its result, so test "> 0" rather than "== 1".
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned `max_memory`, a name that is not defined in this
      // function; the resulting exception was swallowed by the catch below and
      // aborted the run instead of capping the request.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,33 @@
|
||||
process {
|
||||
withLabel: lowmem { memory = 13.Gb }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midmem { memory = 13.Gb }
|
||||
withLabel: midcpu { cpus = 4 }
|
||||
withLabel: highmem { memory = 13.Gb }
|
||||
withLabel: highcpu { cpus = 4 }
|
||||
withLabel: veryhighmem { memory = 13.Gb }
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
}
|
||||
|
||||
env.NUMBA_CACHE_DIR = '/tmp'
|
||||
|
||||
trace {
|
||||
enabled = true
|
||||
overwrite = true
|
||||
}
|
||||
dag {
|
||||
overwrite = true
|
||||
}
|
||||
|
||||
process.maxForks = 1
|
||||
@@ -0,0 +1,187 @@
|
||||
name: "move_files_to_directory"
|
||||
version: "v0.2.0"
|
||||
authors:
|
||||
- name: "Dorien Roosen"
|
||||
roles:
|
||||
- "maintainer"
|
||||
info:
|
||||
links:
|
||||
email: "dorien@data-intuitive.com"
|
||||
github: "dorien-er"
|
||||
linkedin: "dorien-roosen"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Arguments"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
description: "Paths of the files that will be copied into the output directory."
|
||||
info: null
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
description: "Path to output directory"
|
||||
info: null
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "bash_script"
|
||||
path: "script.sh"
|
||||
is_executable: true
|
||||
summary: "Publish one or multiple files to the same directory"
|
||||
description: "This component copies one or multiple files to the same destination\
|
||||
\ directory, creating the output directory if it doesn't exist."
|
||||
test_resources:
|
||||
- type: "bash_script"
|
||||
path: "test.sh"
|
||||
is_executable: true
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
requirements:
|
||||
commands:
|
||||
- "ps"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/viash-hub/craftbox"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "debian:latest"
|
||||
target_registry: "images.viash-hub.com"
|
||||
target_tag: "v0.2.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
- type: "native"
|
||||
id: "native"
|
||||
build_info:
|
||||
config: "src/move_files_to_directory/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker|native"
|
||||
output: "target/nextflow/move_files_to_directory"
|
||||
executable: "target/nextflow/move_files_to_directory/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "1c1b0a4a1aff891ab678072b0ba915ac3ac71610"
|
||||
git_remote: "https://github.com/viash-hub/craftbox"
|
||||
git_tag: "v0.1.0-8-g1c1b0a4"
|
||||
package_config:
|
||||
name: "craftbox"
|
||||
version: "v0.2.0"
|
||||
summary: "A collection of custom-tailored scripts and applied utilities built with\
|
||||
\ Viash.\n"
|
||||
description: "`craftbox` is a curated collection of custom scripts and utilities\
|
||||
\ designed to tackle context-specific tasks.\n\nEmphasizing the Viash principles,\
|
||||
\ `craftbox` components aim for **reusability**, **reproducibility**, and adherence\
|
||||
\ to **best practices**. Key features generally include:\n\n* **Standalone & Nextflow\
|
||||
\ Ready:** Components are built to run directly via the command line or be smoothly\
|
||||
\ integrated into Nextflow workflows.\n* **Custom Implementations:** Contains\
|
||||
\ scripts and tools developed for particular tasks that may not be found in broader\
|
||||
\ collections.\n* **High Quality Standards (promoted by Viash):**\n * Clear\
|
||||
\ documentation for components and their parameters.\n * Full exposure of underlying\
|
||||
\ script/tool arguments for fine-grained control.\n * Containerized (Docker)\
|
||||
\ to ensure dependency management and a consistent, reproducible runtime environment.\n\
|
||||
\ * Unit tested where applicable to ensure components function as expected.\n"
|
||||
info: null
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".requirements.commands := ['ps']\n"
|
||||
- ".engines += { type: \"native\" }"
|
||||
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
|
||||
- ".engines[.type == 'docker'].target_tag := 'v0.2.0'"
|
||||
keywords:
|
||||
- "scripts"
|
||||
- "custom"
|
||||
- "implementations"
|
||||
- "utilities"
|
||||
license: "MIT"
|
||||
organization: "vsh"
|
||||
links:
|
||||
repository: "https://github.com/viash-hub/craftbox"
|
||||
issue_tracker: "https://github.com/viash-hub/craftbox/issues"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'move_files_to_directory'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = 'v0.2.0'
|
||||
description = 'This component copies one or multiple files to the same destination directory, creating the output directory if it doesn\'t exist.'
|
||||
author = 'Dorien Roosen'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema",
|
||||
"title": "move_files_to_directory",
|
||||
"description": "This component copies one or multiple files to the same destination directory, creating the output directory if it doesn\u0027t exist.",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
|
||||
|
||||
|
||||
"arguments" : {
|
||||
"title": "Arguments",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"input": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `file`, required, multiple_sep: `\";\"`. Paths of the files that will be copied into the output directory",
|
||||
"help_text": "Type: List of `file`, required, multiple_sep: `\";\"`. Paths of the files that will be copied into the output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, default: `$id.$key.output`. Path to output directory",
|
||||
"help_text": "Type: `file`, required, default: `$id.$key.output`. Path to output directory"
|
||||
,
|
||||
"default":"$id.$key.output"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"nextflow input-output arguments" : {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"publish_dir": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
|
||||
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"param_list": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel",
|
||||
"help_text": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob.\n\n* A list of maps (as-is) where the keys of each map correspond to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ [\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027], [\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027] ]`.\n* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`.\n* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]`.\n* A yaml blob can also be passed directly as a string. Example: `--param_list \"[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]\"`.\n\nWhen passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativization is performed when `param_list` is a list of maps (as-is) or a yaml blob.",
|
||||
"hidden": true
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/arguments"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,266 @@
|
||||
name: "detect_ingestion_method"
|
||||
namespace: "ingestion_qc"
|
||||
version: "v0.1.0"
|
||||
authors:
|
||||
- name: "Dorien Roosen"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dorien@data-intuitive.com"
|
||||
github: "dorien-er"
|
||||
linkedin: "dorien-roosen"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
- name: "Weiwei Schultz"
|
||||
roles:
|
||||
- "contributor"
|
||||
info:
|
||||
role: "Contributor"
|
||||
organizations:
|
||||
- name: "Janssen R&D US"
|
||||
role: "Associate Director Data Sciences"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
description: "The input h5mu file(s)"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "The modality to use"
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--output_uns_ingestion_method"
|
||||
description: "The .uns field in which to store the experimental setup. Values stored\
|
||||
\ are `cellranger_multi`, `xenium` or `cosmx`."
|
||||
info: null
|
||||
default:
|
||||
- "ingestion_method"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
description: "The output h5mu file, containing an .uns field with experiment description."
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Detects the ingestion method of a dataset.\nCurrently detects either\
|
||||
\ 10X CellRanger Multi, 10X Xenium or Nanostring CosMx, but can be extended to other\
|
||||
\ technologies upon request.\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "sample_one.qc.h5mu"
|
||||
- type: "file"
|
||||
path: "Lung5_Rep2_tiny.qc.h5mu"
|
||||
- type: "file"
|
||||
path: "xenium_tiny.qc.h5mu"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
requirements:
|
||||
commands:
|
||||
- "ps"
|
||||
repositories:
|
||||
- type: "github"
|
||||
name: "openpipeline"
|
||||
repo: "openpipelines-bio/openpipeline"
|
||||
tag: "2.1.2"
|
||||
- type: "vsh"
|
||||
name: "craftbox"
|
||||
repo: "craftbox"
|
||||
tag: "v0.2.0"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "lowmem"
|
||||
- "lowdisk"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.12-slim"
|
||||
target_registry: "images.viash-hub.com"
|
||||
target_tag: "v0.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
- type: "native"
|
||||
id: "native"
|
||||
build_info:
|
||||
config: "src/ingestion_qc/detect_ingestion_method/config.vsh.yaml"
|
||||
runner: "executable"
|
||||
engine: "docker|native"
|
||||
output: "target/executable/ingestion_qc/detect_ingestion_method"
|
||||
executable: "target/executable/ingestion_qc/detect_ingestion_method/detect_ingestion_method"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "4de00a2614069bdaee27943e73a51d378e465c60"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
git_tag: "v0.1.0"
|
||||
package_config:
|
||||
name: "openpipeline_qc"
|
||||
version: "v0.1.0"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-bio/openpipeline_incubator/resources_test"
|
||||
dest: "resources_test"
|
||||
repositories:
|
||||
- type: "github"
|
||||
name: "openpipeline"
|
||||
repo: "openpipelines-bio/openpipeline"
|
||||
tag: "2.1.2"
|
||||
- type: "vsh"
|
||||
name: "craftbox"
|
||||
repo: "craftbox"
|
||||
tag: "v0.2.0"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].directives.tag\
|
||||
\ := '$id'\n.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".engines += { type: \"native\" }"
|
||||
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
|
||||
- ".engines[.type == 'docker'].target_tag := 'v0.1.0'"
|
||||
organization: "vsh"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
docker_registry: "ghcr.io"
|
||||
1219
target/executable/ingestion_qc/detect_ingestion_method/detect_ingestion_method
Executable file
1219
target/executable/ingestion_qc/detect_ingestion_method/detect_ingestion_method
Executable file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,66 @@
|
||||
// Shared Nextflow `process` scope: default resources, a retry policy, and
// label-based CPU / memory / disk presets that components opt into via `withLabel`.
process {
  // Default resources for components that hardly do any processing
  // (the memory request is multiplied by the attempt number, so each retry asks for more)
  memory = { 2.GB * task.attempt }
  cpus = 1

  // Retry for exit codes that have something to do with memory issues
  // (exit statuses 137 through 140 select 'retry'; any other status selects 'terminate')
  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
  maxRetries = 3
  // Optional memory ceiling read by get_memory() below; while this is null,
  // get_memory() returns the requested amount unchanged.
  maxMemory = null

  // CPU resources
  withLabel: singlecpu { cpus = 1 }
  withLabel: lowcpu { cpus = 4 }
  withLabel: midcpu { cpus = 10 }
  withLabel: highcpu { cpus = 20 }

  // Memory resources
  // (each request grows with the attempt number and is passed through get_memory())
  withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
  withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
  withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
  withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }

  // Disk space
  // (every disk label re-uses `process.disk` when it is set and yields null otherwise)
  withLabel: lowdisk {
    disk = {process.disk ? process.disk : null}
  }
  withLabel: middisk {
    disk = {process.disk ? process.disk : null}
  }
  withLabel: highdisk {
    disk = {process.disk ? process.disk : null}
  }
  withLabel: veryhighdisk {
    disk = {process.disk ? process.disk : null}
  }
  // NOTE: The above labels intentionally do not have an effect by default.
  // The user should set the disk space requirements by adding the following
  // to the compute environment:
  //
  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
  // withLabel: middisk { disk = { 100.GB * task.attempt } }
  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
|
||||
|
||||
/**
 * Cap a requested memory amount at the optional `process.maxMemory` ceiling.
 *
 * @param to_compare the requested memory; the callers in this file pass
 *                   values such as `8.GB * task.attempt`, and it is compared
 *                   against a `nextflow.util.MemoryUnit`.
 * @return `to_compare` unchanged when `process.maxMemory` is absent or unset;
 *         `process.maxMemory` when the current attempt number equals
 *         `process.maxRetries`, or when the request exceeds the ceiling;
 *         otherwise `to_compare`.
 *
 * If anything inside the comparison throws (for example an unparsable
 * `maxMemory` or `maxRetries` value), a diagnostic is printed and the JVM
 * exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // When the attempt number equals maxRetries, request the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo() only guarantees a positive value for "greater than",
    // so test `> 0` rather than `== 1`.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Request exceeds the ceiling: clamp to it. This previously returned
      // the undefined variable `max_memory`, which threw and fell through to
      // the catch block below, aborting the run instead of capping.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to log at INFO level to standard output.

    The root logger's level is set to ``logging.INFO`` and a stream handler
    writing to ``sys.stdout`` is attached, formatted as
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: if the root logger already has a stream
    handler bound to the current ``sys.stdout``, no further handler is added.
    Previously every call attached another handler, so calling this more than
    once made each log record appear multiple times.

    Returns:
        logging.Logger: The configured root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only attach a stdout handler when one is not present yet, so repeated
    # calls do not duplicate the output.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        log_formatter = logging.Formatter(
            "%(asctime)s %(levelname)-8s %(message)s"
        )
        console_handler.setFormatter(log_formatter)
        logger.addHandler(console_handler)

    return logger
|
||||
268
target/executable/ingestion_qc/generate_html/.config.vsh.yaml
Normal file
268
target/executable/ingestion_qc/generate_html/.config.vsh.yaml
Normal file
@@ -0,0 +1,268 @@
|
||||
name: "generate_html"
|
||||
namespace: "ingestion_qc"
|
||||
version: "v0.1.0"
|
||||
authors:
|
||||
- name: "Jakub Majercik"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Contributor"
|
||||
links:
|
||||
email: "jakub@data-intuitive.com"
|
||||
github: "jakubmajercik"
|
||||
linkedin: "jakubmajercik"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Bioinformatics Engineer"
|
||||
- name: "Dorien Roosen"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dorien@data-intuitive.com"
|
||||
github: "dorien-er"
|
||||
linkedin: "dorien-roosen"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
- name: "Robrecht Cannoodt"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "robrecht@data-intuitive.com"
|
||||
github: "rcannood"
|
||||
orcid: "0000-0003-3641-729X"
|
||||
linkedin: "robrechtcannoodt"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Science Engineer"
|
||||
- name: "Open Problems"
|
||||
href: "https://openproblems.bio"
|
||||
role: "Core Member"
|
||||
- name: "Weiwei Schultz"
|
||||
roles:
|
||||
- "contributor"
|
||||
info:
|
||||
role: "Contributor"
|
||||
organizations:
|
||||
- name: "Janssen R&D US"
|
||||
role: "Associate Director Data Sciences"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input_data"
|
||||
description: "The input JSON file containing the QC metrics"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.json"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--input_structure"
|
||||
description: "The input JSON file containing the structure of the data"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.json"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output_qc_report"
|
||||
description: "The output HTML report"
|
||||
info: null
|
||||
example:
|
||||
- "path/to/file.html"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "bash_script"
|
||||
path: "script.sh"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Generate an HTML report from the QC metrics"
|
||||
test_resources:
|
||||
- type: "bash_script"
|
||||
path: "test.sh"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "sc_dataset.json"
|
||||
- type: "file"
|
||||
path: "sc_report_structure.json"
|
||||
- type: "file"
|
||||
path: "xenium_dataset.json"
|
||||
- type: "file"
|
||||
path: "xenium_report_structure.json"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
requirements:
|
||||
commands:
|
||||
- "ps"
|
||||
repositories:
|
||||
- type: "github"
|
||||
name: "openpipeline"
|
||||
repo: "openpipelines-bio/openpipeline"
|
||||
tag: "2.1.2"
|
||||
- type: "vsh"
|
||||
name: "craftbox"
|
||||
repo: "craftbox"
|
||||
tag: "v0.2.0"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "lowmem"
|
||||
- "lowdisk"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "node:latest"
|
||||
target_registry: "images.viash-hub.com"
|
||||
target_tag: "v0.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "docker"
|
||||
run:
|
||||
- "npm install -g pnpm@latest-10 \\\n&& cd /opt && git clone -b v0.1.0 https://github.com/openpipelines-bio/siqc.git\
|
||||
\ \\\n&& cd siqc && pnpm install \\\n&& true\n"
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
- type: "native"
|
||||
id: "native"
|
||||
build_info:
|
||||
config: "src/ingestion_qc/generate_html/config.vsh.yaml"
|
||||
runner: "executable"
|
||||
engine: "docker|native"
|
||||
output: "target/executable/ingestion_qc/generate_html"
|
||||
executable: "target/executable/ingestion_qc/generate_html/generate_html"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "4de00a2614069bdaee27943e73a51d378e465c60"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
git_tag: "v0.1.0"
|
||||
package_config:
|
||||
name: "openpipeline_qc"
|
||||
version: "v0.1.0"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-bio/openpipeline_incubator/resources_test"
|
||||
dest: "resources_test"
|
||||
repositories:
|
||||
- type: "github"
|
||||
name: "openpipeline"
|
||||
repo: "openpipelines-bio/openpipeline"
|
||||
tag: "2.1.2"
|
||||
- type: "vsh"
|
||||
name: "craftbox"
|
||||
repo: "craftbox"
|
||||
tag: "v0.2.0"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].directives.tag\
|
||||
\ := '$id'\n.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".engines += { type: \"native\" }"
|
||||
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
|
||||
- ".engines[.type == 'docker'].target_tag := 'v0.1.0'"
|
||||
organization: "vsh"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline_qc"
|
||||
docker_registry: "ghcr.io"
|
||||
1163
target/executable/ingestion_qc/generate_html/generate_html
Executable file
1163
target/executable/ingestion_qc/generate_html/generate_html
Executable file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,66 @@
|
||||
// Shared Nextflow `process` scope: default resources, a retry policy, and
// label-based CPU / memory / disk presets that components opt into via `withLabel`.
process {
  // Default resources for components that hardly do any processing
  // (the memory request is multiplied by the attempt number, so each retry asks for more)
  memory = { 2.GB * task.attempt }
  cpus = 1

  // Retry for exit codes that have something to do with memory issues
  // (exit statuses 137 through 140 select 'retry'; any other status selects 'terminate')
  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
  maxRetries = 3
  // Optional memory ceiling read by get_memory() below; while this is null,
  // get_memory() returns the requested amount unchanged.
  maxMemory = null

  // CPU resources
  withLabel: singlecpu { cpus = 1 }
  withLabel: lowcpu { cpus = 4 }
  withLabel: midcpu { cpus = 10 }
  withLabel: highcpu { cpus = 20 }

  // Memory resources
  // (each request grows with the attempt number and is passed through get_memory())
  withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
  withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
  withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
  withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }

  // Disk space
  // (every disk label re-uses `process.disk` when it is set and yields null otherwise)
  withLabel: lowdisk {
    disk = {process.disk ? process.disk : null}
  }
  withLabel: middisk {
    disk = {process.disk ? process.disk : null}
  }
  withLabel: highdisk {
    disk = {process.disk ? process.disk : null}
  }
  withLabel: veryhighdisk {
    disk = {process.disk ? process.disk : null}
  }
  // NOTE: The above labels intentionally do not have an effect by default.
  // The user should set the disk space requirements by adding the following
  // to the compute environment:
  //
  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
  // withLabel: middisk { disk = { 100.GB * task.attempt } }
  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
|
||||
|
||||
/**
 * Cap a requested memory amount at the optional `process.maxMemory` ceiling.
 *
 * @param to_compare the requested memory; the callers in this file pass
 *                   values such as `8.GB * task.attempt`, and it is compared
 *                   against a `nextflow.util.MemoryUnit`.
 * @return `to_compare` unchanged when `process.maxMemory` is absent or unset;
 *         `process.maxMemory` when the current attempt number equals
 *         `process.maxRetries`, or when the request exceeds the ceiling;
 *         otherwise `to_compare`.
 *
 * If anything inside the comparison throws (for example an unparsable
 * `maxMemory` or `maxRetries` value), a diagnostic is printed and the JVM
 * exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // When the attempt number equals maxRetries, request the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo() only guarantees a positive value for "greater than",
    // so test `> 0` rather than `== 1`.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Request exceeds the ceiling: clamp to it. This previously returned
      // the undefined variable `max_memory`, which threw and fell through to
      // the catch block below, aborting the run instead of capping.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user