Build branch openpipeline_composed/add-integration-methods with version add-integration-methods to openpipeline_composed on branch add-integration-methods (b4f9d7f)

Build pipeline: vsh-ci-build-template-rcbnc

Source commit: b4f9d7fdb0

Source message: rename
This commit is contained in:
CI
2025-09-22 10:20:11 +00:00
commit ddc301140b
371 changed files with 200082 additions and 0 deletions

26
.gitignore vendored Normal file
View File

@@ -0,0 +1,26 @@
# IDEs and editors
/.idea
.project
.classpath
*.launch
.settings/
.vscode
# Temp
gitignore
test_results
# System Files
.DS_Store
Thumbs.db
# Nextflow
work
.nextflow*
trace-*.txt
# viash
/resources_test/
# pycache
*__pycache__*

3
CHANGELOG.md Normal file
View File

@@ -0,0 +1,3 @@
# openpipeline_runners x.x.x
Initial release containing a single-cell meta-workflow to process single cell omics samples, perform batch integration and/or label projection.

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 openpipelines-bio
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

24
_viash.yaml Normal file
View File

@@ -0,0 +1,24 @@
viash_version: 0.9.4
source: src
target: target
name: openpipeline_composed
organization: vsh
links:
repository: https://github.com/openpipelines-bio/openpipeline_composed
docker_registry: ghcr.io
repositories:
- name: openpipeline
repo: openpipelines-bio/openpipeline
type: github
tag: 3.0.0
info:
test_resources:
- type: s3
path: s3://openpipelines-bio/openpipeline_incubator/resources_test
dest: resources_test
config_mods: |
.requirements.commands := ['ps']
.runners[.type == 'nextflow'].directives.tag := '$id'
.resources += {path: '/src/configs/labels.config', dest: 'nextflow_labels.config'}
.runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'
version: add-integration-methods

0
main.nf Normal file
View File

0
nextflow.config Normal file
View File

View File

@@ -0,0 +1,166 @@
#!/bin/bash
# NOTE: the interpreter line previously read "#/bin/bash" (missing "!"), which
# is just a comment, so direct execution did not reliably select bash.

# Output locations for the single-cell and spatial QC test resources.
OUT_DIR=resources_test/qc_sample_data
OUT_DIR_SPATIAL=resources_test/spatial_qc_sample_data

# mkdir -p is a no-op for directories that already exist, so no explicit
# existence check is needed.
mkdir -p "$OUT_DIR" "$OUT_DIR_SPATIAL"
# fetch/create h5mu from somewhere
cat > /tmp/params_create_h5mu.yaml <<EOF
param_list:
- id: sample_one
input_id: sample_one
input: s3://openpipelines-data/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_qc.h5mu
- id: sample_two
input_id: sample_two
input: s3://openpipelines-data/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_qc.h5mu
output: '\$id.qc.h5mu'
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF
# add the sample ID to the mudata object
nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.1.2 \
-main-script target/nextflow/metadata/add_id/main.nf \
-c src/configs/labels_ci.config \
-profile docker \
-params-file /tmp/params_create_h5mu.yaml \
-resume
cat > /tmp/params_subset.yaml <<EOF
param_list:
- id: sample_one
input: resources_test/qc_sample_data/sample_one.qc.h5mu
- id: sample_two
input: resources_test/qc_sample_data/sample_two.qc.h5mu
output: '\$id.qc.h5mu'
number_of_observations: 10000
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF
# subset h5mus
nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.1.2 \
-main-script target/nextflow/filter/subset_h5mu/main.nf \
-c src/configs/labels_ci.config \
-profile docker \
-params-file /tmp/params_subset.yaml \
-resume
# Write a helper Python script that adds randomly generated .obs metadata
# (donor, cell type, batch, condition, quality score) to the "rna" modality of
# every h5mu file in the QC sample data directory, then run it.
# The heredoc is unquoted on purpose: the shell expands the pwd command
# substitution below before the script is written to disk.
cat > /tmp/add_metadata_obs.py <<EOF
import mudata as mu
import glob
import numpy as np
import pandas as pd
import os
import zlib

# Directory containing the h5mu files
out_dir = "$(pwd)/resources_test/qc_sample_data"

# List of h5mu files
h5mu_files = glob.glob(os.path.join(out_dir, "*.h5mu"))
print(f"Found {len(h5mu_files)} h5mu files: {h5mu_files}")

# Metadata values to randomly assign
donor_ids = ["donor_1", "donor_2", "donor_3"]
cell_types = ["CD4+ T cell", "CD8+ T cell", "B cell", "NK cell", "Monocyte"]
batches = ["batch_A", "batch_B"]
conditions = ["treated", "control"]

for h5mu_file in h5mu_files:
    print(f"Processing {h5mu_file}...")

    # Load MuData object
    mdata = mu.read_h5mu(h5mu_file)
    rna = mdata.mod["rna"]
    n_obs = rna.n_obs

    # Different seed for each file, reproducible across runs.
    # The builtin hash() of a str is salted per interpreter process, so it
    # cannot be used for this; a CRC32 of the file name is stable.
    seed_offset = zlib.crc32(os.path.basename(h5mu_file).encode("utf-8")) % 100
    np.random.seed(42 + seed_offset)

    # Create metadata
    rna.obs["donor_id"] = np.random.choice(donor_ids, size=n_obs)
    rna.obs["cell_type"] = np.random.choice(cell_types, size=n_obs)
    rna.obs["batch"] = np.random.choice(batches, size=n_obs)
    rna.obs["condition"] = np.random.choice(conditions, size=n_obs)

    # Add a continuous variable too
    rna.obs["quality_score"] = np.random.uniform(0, 1, size=n_obs)

    # Save the modified MuData object
    mu.write_h5mu(h5mu_file, mdata)
    print(f"Added metadata to {h5mu_file}")

print("All files processed successfully!")
EOF

# Execute the Python script
python /tmp/add_metadata_obs.py
# generate cellbender out for testing
cat > /tmp/params_cellbender.yaml <<EOF
param_list:
- id: sample_one
input: resources_test/qc_sample_data/sample_one.qc.h5mu
- id: sample_two
input: resources_test/qc_sample_data/sample_two.qc.h5mu
output: '\$id.qc.cellbender.h5mu'
epochs: 5
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF
nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.1.2 \
-main-script target/nextflow/correction/cellbender_remove_background/main.nf \
-c src/configs/labels_ci.config \
-profile docker \
-params-file /tmp/params_cellbender.yaml \
-resume
# fetch spatial sample data from s3
aws s3 sync \
--profile di \
s3://openpipelines-bio/openpipeline_incubator/resources_test/spatial_qc_sample_data \
"$OUT_DIR_SPATIAL"
# generate json for testing
viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
--input "$OUT_DIR"/sample_one.qc.cellbender.h5mu \
--input "$OUT_DIR"/sample_two.qc.cellbender.h5mu \
--ingestion_method cellranger_multi \
--obs_metadata "donor_id;cell_type;batch;condition" \
--output "$OUT_DIR"/sc_dataset.json \
--output_reporting_json "$OUT_DIR"/sc_report_structure.json
viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
--input "$OUT_DIR_SPATIAL"/xenium_tiny.qc.h5mu \
--input "$OUT_DIR_SPATIAL"/xenium_tiny.qc.h5mu \
--ingestion_method xenium \
--min_num_nonzero_vars 1 \
--output "$OUT_DIR_SPATIAL"/xenium_dataset.json \
--output_reporting_json "$OUT_DIR_SPATIAL"/xenium_report_structure.json
# remove all state yaml files
rm "$OUT_DIR"/*.yaml
rm "$OUT_DIR_SPATIAL"/*.yaml
# copy to s3
aws s3 sync \
"$OUT_DIR" \
s3://openpipelines-bio/openpipeline_incubator/"$OUT_DIR" \
--delete \
--dryrun
aws s3 sync \
"$OUT_DIR_SPATIAL" \
s3://openpipelines-bio/openpipeline_incubator/"$OUT_DIR_SPATIAL" \
--delete \
--dryrun

View File

@@ -0,0 +1,37 @@
#!/bin/bash
# NOTE: the interpreter line previously read "#/bin/bash" (missing "!"), which
# is just a comment, so direct execution did not reliably select bash.

# Output location for the spatial QC test resources.
OUT_DIR=resources_test/spatial_qc_sample_data

# mkdir -p is a no-op when the directory already exists.
mkdir -p "$OUT_DIR"
# fetch/create h5mu from somewhere
cat > /tmp/qc.yaml <<EOF
param_list:
- id: xenium_tiny
input: s3://openpipelines-bio/openpipeline_spatial/resources_test/xenium/xenium_tiny.h5mu
- id: Lung5_Rep2_tiny
input: s3://openpipelines-bio/openpipeline_spatial/resources_test/cosmx/Lung5_Rep2_tiny.h5mu
var_name_mitochondrial_genes: mitochondrial
var_name_ribosomal_genes: ribosomal
output: '\$id.qc.h5mu'
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF
nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.1.0 \
-main-script target/nextflow/workflows/qc/qc/main.nf \
-profile docker \
-params-file /tmp/qc.yaml \
-resume \
-config src/configs/labels_ci.config
# copy to s3
# Only *.h5mu files are synced (everything is excluded first, then h5mu files
# are re-included). --dryrun is set, so nothing is uploaded or deleted until
# that flag is removed.
# The final argument line must not end with a backslash: a dangling
# continuation would swallow whatever line is later appended to this script.
aws s3 sync \
  --profile di \
  resources_test/spatial_qc_sample_data \
  s3://openpipelines-bio/openpipeline_incubator/resources_test/spatial_qc_sample_data \
  --delete --dryrun \
  --exclude "*" --include "*.h5mu"

View File

@@ -0,0 +1,11 @@
name: Dorien Roosen
info:
role: Core Team Member
links:
email: dorien@data-intuitive.com
github: dorien-er
linkedin: dorien-roosen
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist

View File

@@ -0,0 +1,11 @@
name: Jakub Majercik
info:
role: Contributor
links:
email: jakub@data-intuitive.com
github: jakubmajercik
linkedin: jakubmajercik
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Bioinformatics Engineer

View File

@@ -0,0 +1,15 @@
name: Robrecht Cannoodt
info:
role: Core Team Member
links:
email: robrecht@data-intuitive.com
github: rcannood
orcid: "0000-0003-3641-729X"
linkedin: robrechtcannoodt
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Science Engineer
- name: Open Problems
href: https://openproblems.bio
role: Core Member

View File

@@ -0,0 +1,6 @@
name: Weiwei Schultz
info:
role: Contributor
organizations:
- name: Janssen R&D US
role: Associate Director Data Sciences

View File

@@ -0,0 +1,36 @@
profiles {
  // Temporary directory: the first non-empty value among NXF_TEMP, VIASH_TEMP,
  // TEMPDIR and TMPDIR, falling back to /tmp, resolved to an absolute path.
  tempDir = java.nio.file.Paths.get(
    System.getenv('NXF_TEMP') ?: System.getenv('VIASH_TEMP') ?: System.getenv('TEMPDIR') ?: System.getenv('TMPDIR') ?: '/tmp'
  ).toAbsolutePath()

  // Hand the detected temp directory to the container engines.
  mount_temp {
    docker.temp = tempDir
    podman.temp = tempDir
    charliecloud.temp = tempDir
  }

  // Switch off publishing for every process.
  no_publish {
    process {
      withName: '.*' {
        publishDir = [enabled: false]
      }
    }
  }

  // Run with Docker; all other container engines are explicitly disabled.
  docker {
    docker.enabled = true
    // docker.userEmulation = true
    charliecloud.enabled = false
    podman.enabled = false
    shifter.enabled = false
    singularity.enabled = false
  }
}

66
src/configs/labels.config Normal file
View File

@@ -0,0 +1,66 @@
process {
  // Baseline resources for components that hardly do any processing
  cpus = 1
  memory = { 2.GB * task.attempt }

  // Retry (up to maxRetries times) on exit codes 137-140, which have something
  // to do with memory issues; any other failure terminates the run.
  maxRetries = 3
  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }

  // No memory ceiling by default; get_memory() returns its argument unchanged
  // while this is unset.
  maxMemory = null

  // CPU labels
  withLabel: singlecpu { cpus = 1 }
  withLabel: lowcpu    { cpus = 4 }
  withLabel: midcpu    { cpus = 10 }
  withLabel: highcpu   { cpus = 20 }

  // Memory labels, scaled by the attempt number and passed through get_memory()
  withLabel: verylowmem  { memory = { get_memory( 4.GB * task.attempt ) } }
  withLabel: lowmem      { memory = { get_memory( 8.GB * task.attempt ) } }
  withLabel: midmem      { memory = { get_memory( 16.GB * task.attempt ) } }
  withLabel: highmem     { memory = { get_memory( 64.GB * task.attempt ) } }
  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }

  // Disk labels: pass through process.disk when it is set, otherwise null.
  withLabel: lowdisk      { disk = { process.disk ?: null } }
  withLabel: middisk      { disk = { process.disk ?: null } }
  withLabel: highdisk     { disk = { process.disk ?: null } }
  withLabel: veryhighdisk { disk = { process.disk ?: null } }

  // NOTE: The disk labels above intentionally do not have an effect by default.
  // The user should set the disk space requirements by adding the following
  // to the compute environment:
  //
  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
  // withLabel: middisk { disk = { 100.GB * task.attempt } }
  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested memory amount at process.maxMemory.
//
// to_compare: the requested amount; it is compared against a
//             nextflow.util.MemoryUnit, so presumably it is one itself.
//
// Returns:
//   - to_compare unchanged when process.maxMemory is absent or unset;
//   - process.maxMemory when task.attempt equals process.maxRetries;
//   - process.maxMemory (as a MemoryUnit) when the request exceeds it;
//   - to_compare otherwise.
// Any exception while evaluating the above prints a diagnostic and exits with
// status 1.
// NOTE(review): `task` is not defined in this function; it is assumed to be
// resolvable from the calling directive closure - confirm.
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined name `max_memory`, which threw and
      // sent every over-limit request into the catch block below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,105 @@
process {
  // CPU labels: each label listed here gets 4 cpus
  withLabel: lowcpu  { cpus = 4 }
  withLabel: midcpu  { cpus = 4 }
  withLabel: highcpu { cpus = 4 }

  // Memory labels: each label listed here gets 13 GB
  withLabel: lowmem      { memory = 13.Gb }
  withLabel: midmem      { memory = 13.Gb }
  withLabel: highmem     { memory = 13.Gb }
  withLabel: veryhighmem { memory = 13.Gb }

  // Disk labels: pass through process.disk when it is set, otherwise null
  withLabel: lowdisk      { disk = { process.disk ?: null } }
  withLabel: middisk      { disk = { process.disk ?: null } }
  withLabel: highdisk     { disk = { process.disk ?: null } }
  withLabel: veryhighdisk { disk = { process.disk ?: null } }
}

// Numba cache location exported to the task environment
env.NUMBA_CACHE_DIR = '/tmp'

// Write a trace report, replacing any existing one
trace {
  enabled = true
  overwrite = true
}

// Allow an existing DAG report to be replaced
dag {
  overwrite = true
}

// At most one parallel instance per process
process.maxForks = 1
profiles {
  // Temporary directory: the first non-empty value among NXF_TEMP, VIASH_TEMP,
  // TEMPDIR and TMPDIR, falling back to /tmp, resolved to an absolute path.
  tempDir = java.nio.file.Paths.get(
    System.getenv('NXF_TEMP') ?: System.getenv('VIASH_TEMP') ?: System.getenv('TEMPDIR') ?: System.getenv('TMPDIR') ?: '/tmp'
  ).toAbsolutePath()

  // Hand the detected temp directory to the container engines.
  mount_temp {
    docker.temp = tempDir
    podman.temp = tempDir
    charliecloud.temp = tempDir
  }

  // Switch off publishing for every process.
  no_publish {
    process {
      withName: '.*' {
        publishDir = [enabled: false]
      }
    }
  }

  // Run with Docker (with ownership fixing); other engines are disabled.
  docker {
    docker.enabled = true
    docker.fixOwnership = true
    charliecloud.enabled = false
    podman.enabled = false
    shifter.enabled = false
    singularity.enabled = false
  }

  local {
    // This config is for local processing.
    process {
      // Ceiling consulted by get_memory()
      maxMemory = 25.GB

      withLabel: verylowcpu { cpus = 2 }
      withLabel: lowcpu     { cpus = 4 }
      withLabel: midcpu     { cpus = 6 }
      withLabel: highcpu    { cpus = 12 }

      withLabel: lowmem  { memory = { get_memory( 8.GB * task.attempt ) } }
      withLabel: midmem  { memory = { get_memory( 12.GB * task.attempt ) } }
      withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
    }
  }
}
// Cap a requested memory amount at process.maxMemory.
//
// to_compare: the requested amount; it is compared against a
//             nextflow.util.MemoryUnit, so presumably it is one itself.
//
// Returns:
//   - to_compare unchanged when process.maxMemory is absent or unset;
//   - process.maxMemory when task.attempt equals process.maxRetries;
//   - process.maxMemory (as a MemoryUnit) when the request exceeds it;
//   - to_compare otherwise.
// Any exception while evaluating the above prints a diagnostic and exits with
// status 1.
// NOTE(review): `task` is not defined in this function; it is assumed to be
// resolvable from the calling directive closure - confirm.
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined name `max_memory`, which threw and
      // sent every over-limit request into the catch block below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,385 @@
name: "process_integrate_annotate"
namespace: "single_cell"
description: |
A pipeline to process, integrate and annotate single cell (multi-)omics data.
Available integration methods:
- Harmony
- scVI
Available annotation methods:
- CellTypist
- scANVI (with scArches)
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author, maintainer ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
argument_groups:
- name: Input (query) data arguments
description: The input query dataset(s) to be annotated
arguments:
- name: "--id"
required: true
type: string
description: ID of the sample.
example: foo
- name: "--input"
required: true
type: file
description: Input query dataset(s) to be annotated
example: input.h5mu
- name: "--modality"
default: "rna"
type: string
description: Modality to be processed. Should match the modality in the --reference dataset, if provided.
- name: "--input_layer"
type: string
description: "The layer in the input data containing the raw counts, if .X is not to be used."
required: false
- name: "--input_var_gene_names"
type: string
required: false
description: |
The name of the adata var column containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--input_reference_gene_overlap"
type: integer
default: 100
min: 1
description: |
The minimum number of genes present in both the reference and query datasets.
- name: Reference data arguments
description: Dataset to be used as a reference for label transfer and to train annotation algorithms on.
arguments:
- name: "--reference"
type: file
required: false
example: reference.h5mu
description: |
The reference dataset in .h5mu format to be used as a reference mapper and to train annotation algorithms on.
- name: "--reference_layer_raw_counts"
type: string
description: "The layer in the reference dataset containing the raw counts, if .X is not to be used."
required: false
- name: "--reference_layer_lognormalized_counts"
type: string
default: log_normalized
description: "The layer in the reference dataset containing the log-normalized counts, if .X is not to be used."
- name: "--reference_var_gene_names"
type: string
required: false
description: |
The name of the adata .var column containing gene names if the .var index is not to be used.
- name: "--reference_obs_batch"
type: string
required: false
description: |
The .obs column of the reference dataset containing the batch information.
- name: "--reference_obs_label"
type: string
example: cell_type
required: false
description: The `.obs` key of the target labels to transfer.
- name: "--reference_obs_label_unlabeled_category"
type: string
default: "Unkown"
description: "Value in the --reference_obs_label field that indicates unlabeled observations"
- name: "--reference_var_input"
type: string
required: false
description: |
.var column containing highly variable genes. By default, do not subset genes.
- name: Methods
description: The available annotation and integration methods to integrate and/or annotate the query dataset(s) with.
arguments:
- name: "--integration_methods"
type: string
multiple: true
required: false
choices: [harmony, scvi]
example: harmony;scvi
description: Integration methods to be executed.
- name: "--annotation_methods"
type: string
multiple: true
required: false
choices: [celltypist, scanvi_scarches]
example: celltypist;scanvi_scarches
description: Annotation methods to be executed.
- name: "Pre-processing options: RNA filtering"
description: Pre-processing options for filtering RNA data
arguments:
- name: "--rna_min_counts"
example: 200
type: integer
description: Minimum number of counts captured per cell.
- name: "--rna_max_counts"
example: 5000000
type: integer
description: Maximum number of counts captured per cell.
- name: "--rna_min_genes_per_cell"
type: integer
example: 200
description: Minimum of non-zero values per cell.
- name: "--rna_max_genes_per_cell"
example: 1500000
type: integer
description: Maximum of non-zero values per cell.
- name: "--rna_min_cells_per_gene"
example: 3
type: integer
description: Minimum of non-zero values per gene.
- name: "--rna_min_fraction_mito"
example: 0
type: double
description: Minimum fraction of UMIs that are mitochondrial.
- name: "--rna_max_fraction_mito"
type: double
example: 0.2
description: Maximum fraction of UMIs that are mitochondrial.
- name: "Pre-processing options: Highly variable features detection"
description: Pre-processing options for detecting highly variable features
arguments:
- name: "--n_hvg"
type: integer
description: |
Number of highly-variable features to keep.
Only relevant if HVG need to be calculated across query and reference datasets (e.g. for --annotation_methods scvi_knn and harmony_knn).
For reference mapping-based methods, the HVG's specified in --reference_var_input will be used.
default: 2000
- name: "Pre-processing options: Mitochondrial & Ribosomal Gene Detection"
description: Pre-processing options for detecting mitochondrial genes
arguments:
- name: "--var_name_mitochondrial_genes"
type: string
required: false
description: |
In which .var slot to store a boolean array corresponding to the mitochondrial genes.
- name: "--var_name_ribosomal_genes"
type: string
required: false
description: |
In which .var slot to store a boolean array corresponding to the ribosomal genes.
- name: "--obs_name_mitochondrial_fraction"
type: string
required: false
description: |
When specified, write the fraction of counts originating from mitochondrial genes
(based on --mitochondrial_gene_regex) to an .obs column with the specified name.
Requires --var_name_mitochondrial_genes.
- name: "--obs_name_ribosomal_fraction"
type: string
required: false
description: |
When specified, write the fraction of counts originating from ribosomal genes
(based on --ribosomal_gene_regex) to an .obs column with the specified name.
Requires --var_name_ribosomal_genes.
- name: --mitochondrial_gene_regex
type: string
description: |
Regex string that identifies mitochondrial genes from --var_gene_names.
By default will detect human and mouse mitochondrial genes from a gene symbol.
required: false
default: "^[mM][tT]-"
- name: --ribosomal_gene_regex
type: string
description: |
Regex string that identifies ribosomal genes from --var_gene_names.
By default will detect human and mouse ribosomal genes from a gene symbol.
required: false
default: "^[Mm]?[Rr][Pp][LlSs]"
- name: "Pre-processing options: QC metrics calculation options"
description: Pre-processing options for calculating QC metrics
arguments:
- name: "--var_qc_metrics"
description: |
Keys to select a boolean (containing only True or False) column from .var.
For each cell, calculate the proportion of total values for genes which are labeled 'True',
compared to the total sum of the values for all genes. Defaults to the combined values specified for
--var_name_mitochondrial_genes and --highly_variable_features_var_output.
type: string
multiple: True
multiple_sep: ','
required: false
example: "ercc,highly_variable"
- name: Harmony integration options
description: Specifications for harmony integration.
arguments:
- name: "--harmony_theta"
type: double
description: |
Diversity clustering penalty parameter. Specify for each variable in group.by.vars.
theta=0 does not encourage any diversity. Larger values of theta
result in more diverse clusters.
default: 2
example: [0, 1, 2]
multiple: true
- name: "--harmony_obs_covariates"
type: string
description: "The .obs field(s) that define the covariate(s) to regress out."
example: ["batch", "sample"]
required: true
multiple: true
default: "sample_id"
- name: scVI, scANVI and scArches training options
# TODO - possibly provide separate training options for scVI, scANVI and scArches
description: Training arguments for scVI, scANVI and scArches. Relevant for --integration_methods 'scvi' and --annotation_methods 'scanvi_scarches'.
arguments:
- name: "--early_stopping"
required: false
type: boolean
description: "Whether to perform early stopping with respect to the validation set."
- name: "--early_stopping_monitor"
choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"]
default: "elbo_validation"
type: string
description: "Metric logged during validation set epoch."
- name: "--early_stopping_patience"
type: integer
min: 1
default: 45
description: "Number of validation epochs with no improvement after which training will be stopped."
- name: "--early_stopping_min_delta"
min: 0
type: double
default: 0.0
description: "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement."
- name: "--max_epochs"
type: integer
description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest."
required: false
- name: "--reduce_lr_on_plateau"
description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus."
type: boolean
default: True
- name: "--lr_factor"
description: "Factor to reduce learning rate."
type: double
default: 0.6
min: 0
- name: "--lr_patience"
description: "Number of epochs with no improvement after which learning rate will be reduced."
type: double
default: 30
min: 0
- name: CellTypist reference model
description: The CellTypist reference model to use for annotation. If not provided, the reference dataset will be used for model training.
arguments:
- name: "--celltypist_model"
type: file
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
required: false
example: pretrained_model.pkl
- name: CellTypist annotation options
description: Specifications for CellTypist annotation.
arguments:
- name: "--celltypist_feature_selection"
type: boolean
description: "Whether to perform feature selection."
default: false
- name: "--celltypist_majority_voting"
type: boolean
description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
default: false
- name: "--celltypist_C"
type: double
description: "Inverse of regularization strength in logistic regression."
default: 1.0
- name: "--celltypist_max_iter"
type: integer
description: "Maximum number of iterations before reaching the minimum of the cost function."
default: 1000
- name: "--celltypist_use_SGD"
type: boolean_true
description: "Whether to use the stochastic gradient descent algorithm."
- name: "--celltypist_min_prop"
type: double
description: |
For the dominant cell type within a subcluster, the minimum proportion of cells required to
support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
Subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'.
default: 0
- name: Clustering options
description: Arguments for Leiden clustering. Only relevant for --integration_methods `harmony` and `scvi`, and --annotation_methods `scanvi_scarches`.
arguments:
- name: "--leiden_resolution"
type: double
description: Control the coarseness of the clustering. Higher values lead to more clusters.
default: [1]
multiple: true
- name: Neighbor classifier arguments
description: Arguments related to calculating the n nearest neighbors. Only relevant for --annotation_methods `scvi_knn`, `scanvi_scarches` and `harmony_knn`.
arguments:
- name: "--knn_weights"
type: string
default: "uniform"
choices: ["uniform", "distance"]
description: |
Weight function used in prediction. Possible values are:
`uniform` (all points in each neighborhood are weighted equally) or
`distance` (weight points by the inverse of their distance)
- name: "--knn_n_neighbors"
type: integer
default: 15
min: 5
required: false
description: |
The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent.
Larger values will result in more accurate search results at the cost of computation time.
- name: Outputs
description: The output file to write the annotated dataset to.
arguments:
- name: "--output"
type: file
direction: output
required: true
description: |
The output file.
example: output.h5mu
dependencies:
- name: workflows/multiomics/process_samples
alias: process_samples_workflow
repository: openpipeline
- name: annotate/celltypist
repository: openpipeline
alias: celltypist_annotation
- name: workflows/annotation/scanvi_scarches
repository: openpipeline
alias: scanvi_scarches_annotation
- name: workflows/integration/harmony_leiden
repository: openpipeline
alias: harmony_integration
- name: workflows/integration/scvi_leiden
repository: openpipeline
alias: scvi_integration
resources:
- type: nextflow_script
path: main.nf
entrypoint: run_wf
test_resources:
- type: nextflow_script
path: test.nf
entrypoint: test_wf
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
- path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
- path: /resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
runners:
- type: nextflow

View File

@@ -0,0 +1,37 @@
#!/bin/bash

set -eo pipefail

# Run everything from the repository root so the relative paths below resolve.
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# First entry point: published to the "test" directory.
nextflow \
  run . \
  -main-script src/single_cell/process_integrate_annotate/test.nf \
  -entry test_wf \
  -resume \
  -profile docker \
  -c src/configs/labels_ci.config \
  -c src/configs/integration_tests.config \
  --publish_dir test

# Remaining entry points: same test script, publishing disabled.
for entry_point in test_wf_2 test_wf_3; do
  nextflow \
    run . \
    -main-script src/single_cell/process_integrate_annotate/test.nf \
    -profile docker,no_publish \
    -resume \
    -entry "$entry_point" \
    -c src/configs/labels_ci.config \
    -c src/configs/integration_tests.config
done

View File

@@ -0,0 +1,210 @@
workflow run_wf {
take:
input_ch
main:
output_ch = input_ch
| map { id, state ->
def new_state = state + [ "query_processed": state.output, "_meta": ["join_id": id] ]
[id, new_state]
}
// Make sure parameters are filled out correctly
| map { id, state ->
def new_state = [:]
// Check that at least one of annotation_methods or integration_methods is not empty
if (!state.annotation_methods && !state.integration_methods) {
throw new RuntimeException("At least one of --annotation_methods or --integration_methods must be provided")
}
// Check CellTypist arguments
if (state.annotation_methods && state.annotation_methods.contains("celltypist") &&
(!state.celltypist_model && !state.reference)) {
throw new RuntimeException("Celltypist was selected as an annotation method. Either --celltypist_model or --reference must be provided.")
}
if (state.annotation_methods && state.annotation_methods.contains("celltypist") && state.celltypist_model && state.reference ) {
System.err.println(
"Warning: --celltypist_model is set and a --reference was provided. \
The pre-trained Celltypist model will be used for annotation, the reference will be ignored."
)
}
[id, state + new_state]
}
| process_samples_workflow.run(
fromState: [
"input": "input",
"id": "id",
"rna_layer": "input_layer",
"rna_min_counts": "rna_min_counts",
"rna_max_counts": "rna_max_counts",
"rna_min_genes_per_cell": "rna_min_genes_per_cell",
"rna_max_genes_per_cell": "rna_max_genes_per_cell",
"rna_min_cells_per_gene": "rna_min_cells_per_gene",
"rna_min_fraction_mito": "rna_min_fraction_mito",
"rna_max_fraction_mito": "rna_max_fraction_mito",
"rna_min_fraction_ribo": "rna_min_fraction_ribo",
"rna_max_fraction_ribo": "rna_max_fraction_ribo",
"var_name_mitochondrial_genes": "var_name_mitochondrial_genes",
"var_name_ribosomal_genes": "var_name_ribosomal_genes",
"var_gene_names": "input_var_gene_names",
"mitochondrial_gene_regex": "mitochondrial_gene_regex",
"ribosomal_gene_regex": "ribosomal_gene_regex",
"var_qc_metrics": "var_qc_metrics"
],
args: [
"pca_overwrite": "true",
"add_id_obs_output": "sample_id",
"highly_variable_features_var_output": "filter_with_hvg_query"
],
toState: ["query_processed": "output"],
)
// Integration methods
| harmony_integration.run(
runIf: { id, state ->
state.integration_methods && state.integration_methods.contains("harmony")
},
fromState: [
"id": "id",
"input": "query_processed",
"modality": "modality",
"theta": "harmony_theta",
"leiden_resolution": "leiden_resolution",
"obs_covariates": "harmony_obs_covariates"
],
args: [
"layer": "log_normalized",
"embedding": "X_pca",
"obsm_integrated": "X_harmony_integrated",
"uns_neighbors": "harmony_integration_neighbors",
"obsp_neighbor_distances": "harmony_integration_neighbor_distances",
"obsp_neighbor_connectivities": "harmony_integration_neighbor_connectivities",
"obs_cluster": "harmony_integration_leiden",
"obsm_umap": "X_harmony_umap"
],
toState: [ "query_processed": "output" ]
)
| scvi_integration.run(
runIf: { id, state ->
state.integration_methods && state.integration_methods.contains("scvi")
},
fromState: [
"id": "id",
"input": "query_processed",
"layer": "input_layer",
"modality": "modality",
"leiden_resolution": "leiden_resolution",
"early_stopping": "early_stopping",
"early_stopping_monitor": "early_stopping_monitor",
"early_stopping_patience": "early_stopping_patience",
"early_stopping_min_delta": "early_stopping_min_delta",
"max_epochs": "max_epochs",
"reduce_lr_on_plateau": "reduce_lr_on_plateau",
"lr_factor": "lr_factor",
"lr_patience": "lr_patience"
],
args: [
"obsm_output": "X_scvi_integrated",
"obs_batch": "sample_id",
"var_input": "filter_with_hvg_query",
"uns_neighbors": "scvi_integration_neighbors",
"obsp_neighbor_distances": "scvi_integration_neighbor_distances",
"obsp_neighbor_connectivities": "scvi_integration_neighbor_connectivities",
"obs_cluster": "scvi_integration_leiden",
"obsm_umap": "X_scvi_umap"
],
toState: [ "query_processed": "output", "scvi_model": "output_model" ]
)
// Annotation methods
| celltypist_annotation.run(
runIf: { id, state -> state.annotation_methods && state.annotation_methods.contains("celltypist") && state.celltypist_model },
fromState: [
"input": "query_processed",
"modality": "modality",
"input_var_gene_names": "input_var_gene_names",
"input_reference_gene_overlap": "input_reference_gene_overlap",
"model": "celltypist_model",
"majority_voting": "celltypist_majority_voting"
],
args: [
// log normalized counts are expected for celltypist
"input_layer": "log_normalized",
"output_obs_predictions": "celltypist_pred",
"output_obs_probability": "celltypist_proba"
],
toState: [ "query_processed": "output" ]
)
| celltypist_annotation.run(
runIf: { id, state -> state.annotation_methods && state.annotation_methods.contains("celltypist") && !state.celltypist_model },
fromState: [
"input": "query_processed",
"modality": "modality",
"input_var_gene_names": "input_var_gene_names",
"input_reference_gene_overlap": "input_reference_gene_overlap",
"reference": "reference",
"reference_layer": "reference_layer_lognormalized_counts",
"reference_obs_target": "reference_obs_label",
"reference_var_gene_names": "reference_var_gene_names",
"reference_obs_batch": "reference_obs_batch",
"reference_var_input": "reference_var_input",
"feature_selection": "celltypist_feature_selection",
"C": "celltypist_C",
"max_iter": "celltypist_max_iter",
"use_SGD": "celltypist_use_SGD",
"min_prop": "celltypist_min_prop",
"majority_voting": "celltypist_majority_voting"
],
args: [
// log normalized counts are expected for celltypist
"input_layer": "log_normalized",
"output_obs_predictions": "celltypist_pred",
"output_obs_probability": "celltypist_proba"
],
toState: [ "query_processed": "output" ]
)
| scanvi_scarches_annotation.run(
runIf: { id, state -> state.annotation_methods && state.annotation_methods.contains("scanvi_scarches")},
fromState: [
"id": "id",
"input": "query_processed",
"modality": "modality",
"layer": "input_layer",
"input_var_gene_names": "input_var_gene_names",
"reference": "reference",
"reference_obs_target": "reference_obs_label",
"reference_obs_batch_label": "reference_obs_batch",
"reference_var_hvg": "reference_var_input",
"reference_var_gene_names": "reference_var_gene_names",
"unlabeled_category": "reference_obs_label_unlabeled_category",
"early_stopping": "early_stopping",
"early_stopping_monitor": "early_stopping_monitor",
"early_stopping_patience": "early_stopping_patience",
"early_stopping_min_delta": "early_stopping_min_delta",
"max_epochs": "max_epochs",
"reduce_lr_on_plateau": "reduce_lr_on_plateau",
"lr_factor": "lr_factor",
"lr_patience": "lr_patience",
"leiden_resolution": "leiden_resolution",
"knn_weights": "knn_weights",
"knn_n_neighbors": "knn_n_neighbors"
],
args: [
"input_obs_batch_label": "sample_id",
"output_obs_predictions": "scanvi_knn_pred",
"output_obs_probability": "scanvi_knn_proba"
],
toState: [ "query_processed": "output" ]
)
| map {id, state ->
def new_state = state + ["output": state.query_processed]
[id, new_state]
}
| setState(["output", "_meta"])
emit:
output_ch
}

View File

@@ -0,0 +1,10 @@
// Minimum Nextflow version; the leading '!' makes the requirement mandatory.
manifest.nextflowVersion = '!>=20.12.1-edge'

// Absolute, normalized path three levels above the project directory.
// Other files use it as the base for locating sources and build targets.
params.rootDir = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString()

// Pull in the shared resource-label settings.
includeConfig("${params.rootDir}/src/configs/labels.config")

View File

@@ -0,0 +1,151 @@
// Use the Nextflow DSL2 syntax for this script.
nextflow.enable.dsl=2
// Workflow under test, loaded from the built target directory below params.rootDir.
include { process_integrate_annotate } from params.rootDir + "/target/nextflow/single_cell/process_integrate_annotate/main.nf"
// Base location of the test data that the test workflows in this file resolve their inputs against.
params.resources_test = "s3://openpipelines-bio/openpipeline_incubator/resources_test/"
// Runs process_integrate_annotate on three events (annotation only, integration
// only, annotation plus integration) and validates every emitted [id, state] tuple.
workflow test_wf {

  resources_test = file(params.resources_test)

  // Files shared between the events below.
  def query_file = resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu")
  def reference_file = resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu")

  def annotation_case = [
    id: "simple_annotation_test",
    input: query_file,
    reference: reference_file,
    reference_var_gene_names: "ensemblid",
    reference_layer_lognormalized_counts: "log_normalized",
    reference_obs_batch: "donor_assay",
    reference_obs_label: "cell_type",
    max_epochs: "5",
    annotation_methods: "celltypist;scanvi_scarches"
  ]

  def integration_case = [
    id: "simple_integration_test",
    input: query_file,
    integration_methods: "harmony;scvi"
  ]

  def combined_case = [
    id: "simple_execution_test",
    input: query_file,
    reference: reference_file,
    reference_var_gene_names: "ensemblid",
    reference_layer_lognormalized_counts: "log_normalized",
    reference_obs_batch: "donor_assay",
    reference_obs_label: "cell_type",
    max_epochs: "5",
    annotation_methods: "scanvi_scarches",
    integration_methods: "harmony"
  ]

  output_ch = Channel.fromList([annotation_case, integration_case, combined_case])
    | view { event -> "State at start: $event" }
    // turn each parameter map into the [id, state] tuple the workflow expects
    | map { event -> [event.id, event] }
    | process_integrate_annotate
    | view { result -> "After AaaS: $result" }
    | view { result ->
      assert result.size() == 2 : "Outputs should contain two elements; [id, state]"

      // the first element is the event id
      def id = result[0]
      assert id == "merged" : "Output ID should be `merged`"

      // the second element is the state map holding the output file
      def state = result[1]
      assert state instanceof Map : "State should be a map. Found: ${state}"
      assert state.containsKey("output") : "Output should contain key 'output'."
      assert state.output.isFile() : "'output' should be a file."
      assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"

      "Output: $result"
    }
}
// Runs process_integrate_annotate on a single event that sets explicit RNA/protein
// filtering thresholds together with celltypist annotation and scvi integration,
// then validates the emitted [id, state] tuple.
workflow test_wf_2 {

  resources_test = file(params.resources_test)

  def event_with_params = [
    id: "pbmc_with_more_params",
    input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
    rna_min_counts: 2,
    rna_max_counts: 1000000,
    rna_min_genes_per_cell: 1,
    rna_max_genes_per_cell: 1000000,
    rna_min_cells_per_gene: 1,
    rna_min_fraction_mito: 0.0,
    rna_max_fraction_mito: 1.0,
    prot_min_counts: 3,
    prot_max_counts: 1000000,
    prot_min_proteins_per_cell: 1,
    prot_max_proteins_per_cell: 1000000,
    prot_min_cells_per_protein: 1,
    var_name_mitochondrial_genes: 'mitochondrial',
    obs_name_mitochondrial_fraction: 'fraction_mitochondrial',
    add_id_to_obs: true,
    add_id_make_observation_keys_unique: true,
    add_id_obs_output: "sample_id",
    reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"),
    reference_var_gene_names: "ensemblid",
    reference_layer_lognormalized_counts: "log_normalized",
    reference_obs_batch: "donor_assay",
    reference_obs_label: "cell_type",
    annotation_methods: "celltypist",
    integration_methods: "scvi"
  ]

  output_ch = Channel.fromList([event_with_params])
    | view { event -> "State at start: $event" }
    // turn the parameter map into the [id, state] tuple the workflow expects
    | map { event -> [event.id, event] }
    | process_integrate_annotate
    | view { result -> "After AaaS: $result" }
    | view { result ->
      assert result.size() == 2 : "Outputs should contain two elements; [id, state]"

      // the first element is the event id
      def id = result[0]
      assert id == "merged" : "Output ID should be `merged`"

      // the second element is the state map holding the output file
      def state = result[1]
      assert state instanceof Map : "State should be a map. Found: ${state}"
      assert state.containsKey("output") : "Output should contain key 'output'."
      assert state.output.isFile() : "'output' should be a file."
      assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"

      "Output: $result"
    }
}
// Runs process_integrate_annotate on a single event that supplies a celltypist
// model file instead of a reference dataset, then validates the emitted
// [id, state] tuple.
workflow test_wf_3 {

  resources_test = file(params.resources_test)

  def model_event = [
    id: "celltypist_model",
    input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
    celltypist_model: resources_test.resolve("annotation_test_data/celltypist_model_Immune_All_Low.pkl"),
    annotation_methods: "celltypist",
    input_var_gene_names: "gene_symbol"
  ]

  output_ch = Channel.fromList([model_event])
    | view { event -> "State at start: $event" }
    // turn the parameter map into the [id, state] tuple the workflow expects
    | map { event -> [event.id, event] }
    | process_integrate_annotate
    | view { result -> "After AaaS: $result" }
    | view { result ->
      assert result.size() == 2 : "Outputs should contain two elements; [id, state]"

      // the first element is the event id
      def id = result[0]
      assert id == "merged" : "Output ID should be `merged`"

      // the second element is the state map holding the output file
      def state = result[1]
      assert state instanceof Map : "State should be a map. Found: ${state}"
      assert state.containsKey("output") : "Output should contain key 'output'."
      assert state.output.isFile() : "'output' should be a file."
      assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"

      "Output: $result"
    }
}

0
target/.build.yaml Normal file
View File

View File

@@ -0,0 +1,228 @@
name: "split_modalities"
namespace: "workflows/multiomics"
version: "3.0.0"
authors:
- name: "Dries Schaumont"
roles:
- "author"
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
arguments:
- type: "string"
name: "--id"
description: "ID of the sample."
info: null
example:
- "foo"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Path to the sample."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output directory containing multiple h5mu files."
info: null
example:
- "/path/to/output"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output_types"
description: "A csv containing the base filename and modality type per output\
\ file."
info: null
example:
- "types.csv"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "nextflow_script"
path: "main.nf"
is_executable: true
entrypoint: "run_wf"
- type: "file"
path: "utils"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "A pipeline to split a multimodal mudata files into several unimodal\
\ mudata files."
test_resources:
- type: "nextflow_script"
path: "test.nf"
is_executable: true
entrypoint: "test_wf"
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
info:
test_dependencies:
- name: "split_modalities_test"
namespace: "test_workflows/multiomics"
status: "enabled"
scope:
image: "private"
target: "private"
dependencies:
- name: "dataflow/split_modalities"
alias: "split_modalities_component"
repository:
type: "local"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "nextflow"
id: "nextflow"
directives:
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
build_info:
config: "src/workflows/multiomics/split_modalities/config.vsh.yaml"
runner: "nextflow"
engine: "native"
output: "target/_private/nextflow/workflows/multiomics/split_modalities"
executable: "target/_private/nextflow/workflows/multiomics/split_modalities/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
dependencies:
- "target/nextflow/dataflow/split_modalities"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the split_modalities workflow.
manifest {
name = 'workflows/multiomics/split_modalities'
mainScript = 'main.nf'
// The leading '!' makes the minimum Nextflow version a hard requirement.
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'A pipeline to split a multimodal mudata files into several unimodal mudata files.'
author = 'Dries Schaumont'
}
// Container image applied to processes by default.
process.container = 'nextflow/bash:latest'
// detect tempdir
// Takes the first non-empty value of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR (in that order),
// falls back to '/tmp', and converts the result to an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Selectable configuration profiles.
profiles {
// Disables publishDir for every process (the '.*' name selector matches all of them).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each profile below enables exactly one container engine and explicitly disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Singularity additionally turns on autoMounts.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Resource labels: every withLabel selector assigns a fixed memory amount (in bytes) or a fixed cpu count.
process{
// Decimal memory sizes (powers of 1000): 1 GB up to 500 TB.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory sizes (powers of 1024): 1 GiB up to 512 TiB.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU counts.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Loaded after the labels above.
// NOTE(review): presumably this lets the shared labels file add to or override them; confirm.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
// Shared process defaults and resource labels.
process {
// Default resources for components that hardly do any processing
// (memory grows with the attempt number: 2 GB times task.attempt)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 select 'retry'; anything else selects 'terminate')
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests its base amount times task.attempt. When both a memory resource limit
// and maxRetries are set and the current attempt has reached maxRetries, the configured
// memory limit itself is requested instead.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// Each disk label passes process.disk through when it is set and yields null otherwise.
// NOTE(review): `process` inside these closures presumably resolves to the process config scope; confirm.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,36 @@
// Profiles for mounting the temp directory, disabling publishing and running with docker.
profiles {
// detect tempdir
// Takes the first non-empty value of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR (in that order),
// falls back to '/tmp', and converts the result to an absolute path.
// NOTE(review): this assignment sits inside the `profiles` scope rather than at top level;
// confirm it is parsed as intended by the targeted Nextflow version.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Points the temp setting of docker, podman and charliecloud at tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Disables publishDir for every process (the '.*' name selector matches all of them).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Enables docker and explicitly disables the other container engines.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
}

View File

@@ -0,0 +1,48 @@
// Shared process defaults and resource labels.
process {
// Default resources for components that hardly do any processing
// (memory grows with the attempt number: 2 GB times task.attempt)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 select 'retry'; anything else selects 'terminate')
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests its base amount times task.attempt. When both a memory resource limit
// and maxRetries are set and the current attempt has reached maxRetries, the configured
// memory limit itself is requested instead.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// Each disk label passes process.disk through when it is set and yields null otherwise.
// NOTE(review): `process` inside these closures presumably resolves to the process config scope; confirm.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,33 @@
// Flattened resource labels: every memory label maps to the same 13.Gb and every
// multi-cpu label to 4 cpus, regardless of the label's nominal size.
process {
withLabel: lowmem { memory = 13.Gb }
withLabel: lowcpu { cpus = 4 }
withLabel: midmem { memory = 13.Gb }
withLabel: midcpu { cpus = 4 }
withLabel: highmem { memory = 13.Gb }
withLabel: highcpu { cpus = 4 }
withLabel: veryhighmem { memory = 13.Gb }
// Each disk label passes process.disk through when it is set and yields null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
}
// Sets the NUMBA_CACHE_DIR environment variable to '/tmp'.
// NOTE(review): presumably so numba can write its cache inside the task environment; confirm.
env.NUMBA_CACHE_DIR = '/tmp'
// Write an execution trace and allow an existing trace file to be overwritten.
trace {
enabled = true
overwrite = true
}
// Allow an existing DAG file to be overwritten.
dag {
overwrite = true
}
// Run at most one instance of each process at a time.
process.maxForks = 1

View File

@@ -0,0 +1,466 @@
name: "celltypist"
namespace: "annotate"
version: "3.0.0"
authors:
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
description: "Input dataset (query) arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "The input (query) data to be labeled. Should be a .h5mu file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "The layer in the input data containing log normalized counts to\
\ be used for cell type annotation if .X is not to be used."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_var_gene_names"
description: "The name of the adata var column in the input data containing gene\
\ names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--input_reference_gene_overlap"
description: "The minimum number of genes present in both the reference and query\
\ datasets.\n"
info: null
default:
- 100
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Reference"
description: "Arguments related to the reference dataset."
arguments:
- type: "file"
name: "--reference"
description: "The reference data to train the CellTypist classifiers on. Only\
\ required if a pre-trained --model is not provided."
info: null
example:
- "reference.h5mu"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_layer"
description: "The layer in the reference data to be used for cell type annotation\
\ if .X is not to be used. Data are expected to be processed in the same way\
\ as the --input query dataset."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_obs_target"
description: "The name of the adata obs column in the reference data containing\
\ cell type annotations."
info: null
default:
- "cell_ontology_class"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_gene_names"
description: "The name of the adata var column in the reference data containing\
\ gene names; when no gene_name_layer is provided, the var index will be used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--reference_var_input"
description: ".var column containing highly variable genes. By default, do not\
\ subset genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Model arguments"
description: "Model arguments."
arguments:
- type: "file"
name: "--model"
description: "Pretrained model in pkl format. If not provided, the model will\
\ be trained on the reference data and --reference should be provided."
info: null
example:
- "pretrained_model.pkl"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--feature_selection"
description: "Whether to perform feature selection."
info: null
default:
- false
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--majority_voting"
description: "Whether to refine the predicted labels by running the majority voting\
\ classifier after over-clustering."
info: null
default:
- false
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--C"
description: "Inverse of regularization strength in logistic regression."
info: null
default:
- 1.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_iter"
description: "Maximum number of iterations before reaching the minimum of the\
\ cost function."
info: null
default:
- 1000
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--use_SGD"
description: "Whether to use the stochastic gradient descent algorithm."
info: null
direction: "input"
- type: "double"
name: "--min_prop"
description: "\"For the dominant cell type within a subcluster, the minimum proportion\
\ of cells required to \nsupport naming of the subcluster by this cell type.\
\ Ignored if majority_voting is set to False. \nSubcluster that fails to pass\
\ this proportion threshold will be assigned 'Heterogeneous'.\"\n"
info: null
default:
- 0.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Output arguments."
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_predictions"
description: "In which `.obs` slots to store the predicted information.\n"
info: null
default:
- "celltypist_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_probability"
description: "In which `.obs` slots to store the probability of the predictions.\n"
info: null
default:
- "celltypist_probability"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "cross_check_genes.py"
- type: "file"
path: "subset_vars.py"
- type: "file"
path: "set_var_index.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Automated cell type annotation tool for scRNA-seq datasets on the basis\
\ of logistic regression classifiers optimised by the stochastic gradient descent\
\ algorithm."
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "annotation_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highcpu"
- "highmem"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.10-slim"
target_tag: "3.0.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "libhdf5-dev"
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "scanpy~=1.10.4"
upgrade: true
- type: "python"
user: false
packages:
- "celltypist==1.6.3"
upgrade: true
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/annotate/celltypist/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/annotate/celltypist"
executable: "target/nextflow/annotate/celltypist/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
- ".engines[.type == 'docker'].target_tag := '3.0.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,26 @@
from typing import List
def cross_check_genes(
    query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100
) -> List[str]:
    """Return the genes shared by a query and a reference gene list.

    Parameters
    ----------
    query_genes : List[str]
        List of gene names of the query dataset
    reference_genes : List[str]
        List of gene names of the reference dataset
    min_gene_overlap : int
        Minimum number of shared genes that is required (default: 100)

    Returns
    -------
    List[str]
        The unique genes that occur in both lists, in the order in which they
        first appear in `reference_genes`.

    Raises
    ------
    AssertionError
        If fewer than `min_gene_overlap` genes are shared.
    """
    query_lookup = set(query_genes)
    # dict.fromkeys() drops duplicates while keeping first-seen order, so the
    # result no longer depends on set iteration order (which for strings
    # changes between interpreter runs because of hash randomisation).
    common_ens_ids = [
        gene for gene in dict.fromkeys(reference_genes) if gene in query_lookup
    ]
    # Raised explicitly instead of through an `assert` statement so that the
    # check is still performed when Python runs with -O; the exception type
    # and message are the same as before.
    if len(common_ens_ids) < min_gene_overlap:
        raise AssertionError(
            f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}."
        )
    return common_ens_ids

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the 'annotate/celltypist' component.
manifest {
name = 'annotate/celltypist'
mainScript = 'main.nf'
// Minimum Nextflow version. Per the Nextflow manifest documentation, the
// leading '!' turns the version check into a hard requirement.
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.'
author = 'Jakub Majercik, Weiwei Schultz'
}
// Container image set at process scope.
process.container = 'nextflow/bash:latest'
// detect tempdir
// The first of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR that yields a value is
// used, with '/tmp' as the fallback; the result is converted to an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Selectable configuration profiles.
profiles {
// Disables publishDir for every process (the selector '.*' matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at the
// temp directory detected above.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each of the following profiles enables exactly one container engine and
// explicitly disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// The singularity profile additionally switches on autoMounts.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Maps resource labels to fixed memory and cpu requests.
process{
// Decimal memory labels: the value is the label's size in powers of ten
// (mem1gb = 10^9 bytes, mem1tb = 10^12 bytes).
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory labels: the value is the label's size in powers of two
// (mem1gib = 2^30 bytes, mem1tib = 2^40 bytes).
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU labels: the number in the label is the number of cpus requested.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Pulls in the additional label definitions from nextflow_labels.config.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
// Default resources and label-based resource settings for all processes.
process {
// Default resources for components that hardly do any processing
// The memory request is a closure: 2 GB multiplied by the attempt number.
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 select 'retry', anything else 'terminate')
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each closure requests task.resourceLimits.memory when such a limit is set,
// task.maxRetries is truthy and task.attempt >= task.maxRetries; otherwise it
// requests the label's base amount (4, 25, 50 or 75 GB) times the attempt number.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// Each closure below evaluates to process.disk when that is truthy and to
// null otherwise, so all four disk labels are written identically.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,34 @@
# Inputs
input: # please fill in - example: "input.h5mu"
modality: "rna"
# input_layer: "foo"
# input_var_gene_names: "foo"
input_reference_gene_overlap: 100
# Reference
# reference: "reference.h5mu"
# reference_layer: "foo"
reference_obs_target: "cell_ontology_class"
# reference_var_gene_names: "foo"
# reference_var_input: "foo"
# Model arguments
# model: "pretrained_model.pkl"
feature_selection: false
majority_voting: false
C: 1.0
max_iter: 1000
use_SGD: false
min_prop: 0.0
# Outputs
# output: "$id.$key.output.h5mu"
output_obs_predictions: "celltypist_pred"
output_obs_probability: "celltypist_probability"
# output_compression: "gzip"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"
# Arguments

View File

@@ -0,0 +1,205 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "celltypist",
"description": "Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.",
"type": "object",
"$defs": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"type": "string",
"default": "",
"format": "file-path",
"mimetype": "text/csv"
}
}
},
"inputs": {
"title": "Inputs",
"type": "object",
"description": "Input dataset (query) arguments",
"properties": {
"input": {
"type": "string",
"format": "path",
"exists": true,
"description": "The input (query) data to be labeled",
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. "
},
"modality": {
"type": "string",
"description": "Which modality to process.",
"help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ",
"default": "rna"
},
"input_layer": {
"type": "string",
"description": "The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used.",
"help_text": "Type: `string`, multiple: `False`. "
},
"input_var_gene_names": {
"type": "string",
"description": "The name of the adata var column in the input data containing gene names; when not provided, the var index will be used.\n",
"help_text": "Type: `string`, multiple: `False`. "
},
"input_reference_gene_overlap": {
"type": "integer",
"description": "The minimum number of genes present in both the reference and query datasets.\n",
"help_text": "Type: `integer`, multiple: `False`, default: `100`. ",
"default": 100
}
}
},
"outputs": {
"title": "Outputs",
"type": "object",
"description": "Output arguments.",
"properties": {
"output": {
"type": "string",
"format": "path",
"description": "Output h5mu file.",
"help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ",
"default": "$id.$key.output.h5mu"
},
"output_obs_predictions": {
"type": "string",
"description": "In which `.obs` slots to store the predicted information.\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"celltypist_pred\"`. ",
"default": "celltypist_pred"
},
"output_obs_probability": {
"type": "string",
"description": "In which `.obs` slots to store the probability of the predictions.\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"celltypist_probability\"`. ",
"default": "celltypist_probability"
},
"output_compression": {
"type": "string",
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: `gzip`, `lzf`. ",
"enum": [
"gzip",
"lzf"
]
}
}
},
"reference": {
"title": "Reference",
"type": "object",
"description": "Arguments related to the reference dataset.",
"properties": {
"reference": {
"type": "string",
"format": "path",
"description": "The reference data to train the CellTypist classifiers on",
"help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"reference.h5mu\"`. "
},
"reference_layer": {
"type": "string",
"description": "The layer in the reference data to be used for cell type annotation if .X is not to be used",
"help_text": "Type: `string`, multiple: `False`. "
},
"reference_obs_target": {
"type": "string",
"description": "The name of the adata obs column in the reference data containing cell type annotations.",
"help_text": "Type: `string`, multiple: `False`, default: `\"cell_ontology_class\"`. ",
"default": "cell_ontology_class"
},
"reference_var_gene_names": {
"type": "string",
"description": "The name of the adata var column in the reference data containing gene names; when not provided, the var index will be used.\n",
"help_text": "Type: `string`, multiple: `False`. "
},
"reference_var_input": {
"type": "string",
"description": ".var column containing highly variable genes",
"help_text": "Type: `string`, multiple: `False`. "
}
}
},
"model arguments": {
"title": "Model arguments",
"type": "object",
"description": "Model arguments.",
"properties": {
"model": {
"type": "string",
"format": "path",
"description": "Pretrained model in pkl format",
"help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"pretrained_model.pkl\"`. "
},
"feature_selection": {
"type": "boolean",
"description": "Whether to perform feature selection.",
"help_text": "Type: `boolean`, multiple: `False`, default: `false`. ",
"default": false
},
"majority_voting": {
"type": "boolean",
"description": "Whether to refine the predicted labels by running the majority voting classifier after over-clustering.",
"help_text": "Type: `boolean`, multiple: `False`, default: `false`. ",
"default": false
},
"C": {
"type": "number",
"description": "Inverse of regularization strength in logistic regression.",
"help_text": "Type: `double`, multiple: `False`, default: `1.0`. ",
"default": 1.0
},
"max_iter": {
"type": "integer",
"description": "Maximum number of iterations before reaching the minimum of the cost function.",
"help_text": "Type: `integer`, multiple: `False`, default: `1000`. ",
"default": 1000
},
"use_SGD": {
"type": "boolean",
"description": "Whether to use the stochastic gradient descent algorithm.",
"help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ",
"default": false
},
"min_prop": {
"type": "number",
"description": "For the dominant cell type within a subcluster, the minimum proportion of cells required to \nsupport naming of the subcluster by this cell type.",
"help_text": "Type: `double`, multiple: `False`, default: `0.0`. ",
"default": 0.0
}
}
},
"nextflow input-output arguments": {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type": "string",
"description": "Path to an output directory.",
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
}
}
}
},
"allOf": [
{
"$ref": "#/$defs/inputs"
},
{
"$ref": "#/$defs/outputs"
},
{
"$ref": "#/$defs/reference"
},
{
"$ref": "#/$defs/model arguments"
},
{
"$ref": "#/$defs/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,24 @@
import anndata as ad
import re
def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
    """Replace the index of `adata.var` with sanitized gene names.

    Sanitizing removes a trailing ".<digits>" suffix from every name.

    Parameters
    ----------
    adata : AnnData
        Annotated data object; its `.var` index is overwritten in place.
    var_name : str | None
        Column of `adata.var` holding the gene names. When it is None (or
        empty), the current index is sanitized instead of being replaced.

    Returns
    -------
    AnnData
        The same `adata` object that was passed in, with the new index.
    """
    # Pick where the raw names come from: the requested column or the index.
    raw_names = adata.var[var_name] if var_name else adata.var.index
    adata.var.index = [re.sub("\\.[0-9]+$", "", name) for name in raw_names]
    return adata

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    Returns
    -------
    logging.Logger
        The root logger, with its level set to INFO and a stream handler on
        stdout that formats records as "<time> <level> <message>".

    Notes
    -----
    Calling this function repeatedly is safe: the handler is attached only
    once per stdout stream, so records are not printed multiple times.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only handlers created by this function (marked below) and bound to the
    # current stdout count; handlers installed by other code are left alone.
    already_attached = any(
        getattr(handler, "_from_setup_logger", False)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        # Marker used by the check above on later calls.
        console_handler._from_setup_logger = True
        logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,31 @@
def subset_vars(adata, subset_col):
    """Keep only the features flagged in a boolean `.var` column.

    Parameters
    ----------
    adata : AnnData
        Annotated data object
    subset_col : str
        Name of the boolean column in `adata.var` marking the features to keep

    Returns
    -------
    AnnData
        Copy of `adata` restricted to the flagged features

    Raises
    ------
    ValueError
        If `subset_col` is not a column of `adata.var`.
    AssertionError
        If the column contains missing values or is not of boolean dtype.

    Notes
    -----
    A column of dtype "boolean" is converted to "bool" on the input object
    itself before the selection is made.
    """
    if subset_col not in adata.var.columns:
        raise ValueError(
            f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
        )

    is_nullable_boolean = adata.var[subset_col].dtype == "boolean"
    if is_nullable_boolean:
        missing_count = adata.var[subset_col].isna().sum()
        assert missing_count == 0, (
            f"The .var column `{subset_col}` contains NaN values. Can not subset data."
        )
        adata.var[subset_col] = adata.var[subset_col].astype("bool")

    final_dtype = adata.var[subset_col].dtype
    assert final_dtype == "bool", (
        f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data."
    )

    feature_mask = adata.var[subset_col]
    return adata[:, feature_mask].copy()

View File

@@ -0,0 +1,475 @@
name: "scanvi"
namespace: "annotate"
version: "3.0.0"
authors:
- name: "Dorien Roosen"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dorien@data-intuitive.com"
github: "dorien-er"
linkedin: "dorien-roosen"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Jakub Majercik"
roles:
- "author"
info:
role: "Contributor"
links:
email: "jakub@data-intuitive.com"
github: "jakubmajercik"
linkedin: "jakubmajercik"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Bioinformatics Engineer"
- name: "Weiwei Schultz"
roles:
- "contributor"
info:
role: "Contributor"
organizations:
- name: "Janssen R&D US"
role: "Associate Director Data Sciences"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file. Note that this needs to be the exact same dataset\
\ as the --scvi_model was trained on."
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "Input layer to use. If None, X is used"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_input"
description: ".var column containing highly variable genes that were used to train\
\ the scVI model. By default, do not subset genes."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_gene_names"
description: ".var column containing gene names. By default, use the index."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_labels"
description: ".obs field containing the labels"
info: null
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--unlabeled_category"
description: "Value in the --obs_labels field that indicates unlabeled observations\n"
info: null
default:
- "Unknown"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "scVI Model"
arguments:
- type: "file"
name: "--scvi_model"
description: "Pretrained SCVI reference model to initialize the SCANVI model with."
info: null
example:
- "scvi_model.pt"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output_model"
description: "Folder where the state of the trained model will be saved to."
info: null
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_output"
description: "In which .obsm slot to store the resulting integrated embedding."
info: null
default:
- "X_scanvi_integrated"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_output_predictions"
description: "In which .obs slot to store the predicted labels."
info: null
default:
- "scanvi_pred"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_output_probabilities"
description: "In which .obs slot to store the probabilities of the predicted labels."
info: null
default:
- "scanvi_proba"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "scANVI training arguments"
arguments:
- type: "boolean"
name: "--early_stopping"
description: "Whether to perform early stopping with respect to the validation\
\ set."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--early_stopping_monitor"
description: "Metric logged during validation set epoch."
info: null
default:
- "elbo_validation"
required: false
choices:
- "elbo_validation"
- "reconstruction_loss_validation"
- "kl_local_validation"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--early_stopping_patience"
description: "Number of validation epochs with no improvement after which training\
\ will be stopped."
info: null
default:
- 45
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--early_stopping_min_delta"
description: "Minimum change in the monitored quantity to qualify as an improvement,\
\ i.e. an absolute change of less than min_delta, will count as no improvement."
info: null
default:
- 0.0
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_epochs"
description: "Number of passes through the dataset, defaults to (20000 / number\
\ of cells) * 400 or 400; whichever is smallest."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean"
name: "--reduce_lr_on_plateau"
description: "Whether to monitor validation loss and reduce learning rate when\
\ validation set `lr_scheduler_metric` plateaus."
info: null
default:
- true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--lr_factor"
description: "Factor to reduce learning rate."
info: null
default:
- 0.6
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--lr_patience"
description: "Number of epochs with no improvement after which learning rate will\
\ be reduced."
info: null
default:
- 30.0
required: false
min: 0.0
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "subset_vars.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "set_var_index.py"
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "scANVI is a semi-supervised model for single-cell transcriptomics\
\ data. scANVI is an scVI extension that can leverage the cell type knowledge for\
\ a subset of the cells present in the data sets to infer the states of the rest\
\ of the cells.\nThis component will instantiate a scANVI model from a pre-trained\
\ scVI model, integrate the data and perform label prediction.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "scvi_model"
- type: "file"
path: "TS_Blood_filtered.h5mu"
- type: "file"
path: "pbmc_1k_protein_v3_mms.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midcpu"
- "midmem"
- "gpu"
- "highdisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "nvcr.io/nvidia/pytorch:25.05-py3"
target_tag: "3.0.0"
namespace_separator: "/"
setup:
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
- type: "python"
user: false
packages:
- "jax[cuda]"
- "scvi-tools~=1.3.1"
upgrade: true
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/annotate/scanvi/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/annotate/scanvi"
executable: "target/nextflow/annotate/scanvi/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
- ".engines[.type == 'docker'].target_tag := '3.0.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a compressed copy of an .h5mu file.

    Every group of the input file is recreated in the output file together
    with its attributes. Datasets that are not yet compressed and are not
    scalar are rewritten with the requested compression; all other datasets
    are copied unchanged. Finally the text header stored at the start of the
    input file is copied into the 512-byte user block of the output file.

    Parameters
    ----------
    input_path : str | Path
        Path of the .h5mu file to read.
    output_path : str | Path
        Path of the file to create (opened in "w" mode).
    compression : "gzip" | "lzf"
        Compression filter passed to `create_dataset`.

    Raises
    ------
    NotImplementedError
        If the input contains an object that is neither a group nor a dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size in bytes of the user block that holds the MuData header.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        object: Union[Group, Dataset],
    ):
        if isinstance(object, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(object, new_group)
        elif isinstance(object, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects dont have a shape defined
            if not object.compression and object.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=object, compression=compression
                )
                copy_attributes(object, new_dataset)
            else:
                output_h5.copy(object, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(object)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole user block so that long headers are not cut off.
        userblock = input_bytes.read(userblock_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # split() keeps the complete block when it contains no null byte, whereas
    # slicing at the result of find() would drop the last byte in that case
    # (find() returns -1 when nothing is found).
    starting_metadata = userblock.split(b"\x00", 1)[0]
    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file, write one modality into the copy and optionally compress it.

    Parameters
    ----------
    output_file : str | Path
        Path of the resulting .h5mu file.
    h5mu : str | Path
        Path of the existing .h5mu file that is copied first.
    modality_name : str
        Name of the modality, passed as `mod` to `write_h5ad`.
    modality_data : AnnData
        Data for that modality, passed as `data` to `write_h5ad`.
    output_compression : str | None
        When truthy, the result is first written to a sibling file named
        "<stem>_uncompressed.h5mu" and then compressed into `output_file`
        with `compress_h5mu`. When falsy, `output_file` is written directly.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)
    output_file_uncompressed = (
        output_file.with_name(output_file.stem + "_uncompressed.h5mu")
        if output_compression
        else output_file
    )
    try:
        shutil.copyfile(h5mu, output_file_uncompressed)
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        if output_compression:
            compress_h5mu(
                output_file_uncompressed, output_file, compression=output_compression
            )
    finally:
        # The scratch file only exists as a separate file when compressing.
        # Removing it here also cleans up when writing or compressing failed,
        # instead of leaving a stray "*_uncompressed.h5mu" file behind.
        if output_compression:
            output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the 'annotate/scanvi' component.
manifest {
name = 'annotate/scanvi'
mainScript = 'main.nf'
// Minimum Nextflow version. Per the Nextflow manifest documentation, the
// leading '!' turns the version check into a hard requirement.
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'scANVI () is a semi-supervised model for single-cell transcriptomics data. scANVI is an scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.\nThis component will instantiate a scANVI model from a pre-trained scVI model, integrate the data and perform label prediction.\n'
author = 'Dorien Roosen, Jakub Majercik, Weiwei Schultz'
}
// Container image set at process scope.
process.container = 'nextflow/bash:latest'
// detect tempdir
// The first of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR that yields a value is
// used, with '/tmp' as the fallback; the result is converted to an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Selectable configuration profiles.
profiles {
// Disables publishDir for every process (the selector '.*' matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at the
// temp directory detected above.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each of the following profiles enables exactly one container engine and
// explicitly disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// The singularity profile additionally switches on autoMounts.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed resource labels: a process carrying one of these labels gets the
// memory (in bytes) or CPU count written next to it.
process{
// Decimal memory sizes: gb = 10^9 bytes, tb = 10^12 bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory sizes: gib = 2^30 bytes, tib = 2^40 bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU counts.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Loads further settings from nextflow_labels.config, resolved relative to this file.
// NOTE(review): presumably the lowmem/midmem/... label definitions shipped with the component - confirm.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
// Resource defaults and label-based resource classes.
process {
// Default resources for components that hardly do any processing
// (the memory request is multiplied by the attempt number).
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 are retried, any other status terminates the run).
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests <base size> * attempt. When both resourceLimits.memory and
// maxRetries are set and the attempt number has reached maxRetries, the closure
// requests resourceLimits.memory instead.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// All four closures are identical: they yield process.disk when it is truthy and null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,35 @@
# Inputs
input: # please fill in - example: "path/to/file"
modality: "rna"
# input_layer: "foo"
# var_input: "foo"
# var_gene_names: "foo"
obs_labels: # please fill in - example: "foo"
unlabeled_category: "Unknown"
# scVI Model
scvi_model: # please fill in - example: "scvi_model.pt"
# Outputs
# output: "$id.$key.output"
# output_model: "$id.$key.output_model"
obsm_output: "X_scanvi_integrated"
obs_output_predictions: "scanvi_pred"
obs_output_probabilities: "scanvi_proba"
# output_compression: "gzip"
# scANVI training arguments
# early_stopping: true
early_stopping_monitor: "elbo_validation"
early_stopping_patience: 45
early_stopping_min_delta: 0.0
# max_epochs: 123
reduce_lr_on_plateau: true
lr_factor: 0.6
lr_patience: 30.0
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"
# Arguments

View File

@@ -0,0 +1,217 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "scanvi",
"description": "scANVI () is a semi-supervised model for single-cell transcriptomics data. scANVI is an scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.\nThis component will instantiate a scANVI model from a pre-trained scVI model, integrate the data and perform label prediction.\n",
"type": "object",
"$defs": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"type": "string",
"default": "",
"format": "file-path",
"mimetype": "text/csv"
}
}
},
"inputs": {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type": "string",
"format": "path",
"exists": true,
"description": "Input h5mu file",
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`. "
},
"modality": {
"type": "string",
"description": "Which modality from the input MuData file to process.\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ",
"default": "rna"
},
"input_layer": {
"type": "string",
"description": "Input layer to use",
"help_text": "Type: `string`, multiple: `False`. "
},
"var_input": {
"type": "string",
"description": ".var column containing highly variable genes that were used to train the scVi model",
"help_text": "Type: `string`, multiple: `False`. "
},
"var_gene_names": {
"type": "string",
"description": ".var column containing gene names",
"help_text": "Type: `string`, multiple: `False`. "
},
"obs_labels": {
"type": "string",
"description": ".obs field containing the labels",
"help_text": "Type: `string`, multiple: `False`, required. "
},
"unlabeled_category": {
"type": "string",
"description": "Value in the --obs_labels field that indicates unlabeled observations\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"Unknown\"`. ",
"default": "Unknown"
}
}
},
"outputs": {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"output": {
"type": "string",
"format": "path",
"description": "Output h5mu file.",
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ",
"default": "$id.$key.output"
},
"output_model": {
"type": "string",
"format": "path",
"description": "Folder where the state of the trained model will be saved to.",
"help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output_model\"`, direction: `output`. ",
"default": "$id.$key.output_model"
},
"obsm_output": {
"type": "string",
"description": "In which .obsm slot to store the resulting integrated embedding.",
"help_text": "Type: `string`, multiple: `False`, default: `\"X_scanvi_integrated\"`. ",
"default": "X_scanvi_integrated"
},
"obs_output_predictions": {
"type": "string",
"description": "In which .obs slot to store the predicted labels.",
"help_text": "Type: `string`, multiple: `False`, default: `\"scanvi_pred\"`. ",
"default": "scanvi_pred"
},
"obs_output_probabilities": {
"type": "string",
"description": "In which .obs slot to store the probabilities of the predicted labels.",
"help_text": "Type: `string`, multiple: `False`, default: `\"scanvi_proba\"`. ",
"default": "scanvi_proba"
},
"output_compression": {
"type": "string",
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ",
"enum": [
"gzip",
"lzf"
]
}
}
},
"scvi model": {
"title": "scVI Model",
"type": "object",
"description": "No description",
"properties": {
"scvi_model": {
"type": "string",
"format": "path",
"exists": true,
"description": "Pretrained SCVI reference model to initialize the SCANVI model with.",
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"scvi_model.pt\"`. "
}
}
},
"scanvi training arguments": {
"title": "scANVI training arguments",
"type": "object",
"description": "No description",
"properties": {
"early_stopping": {
"type": "boolean",
"description": "Whether to perform early stopping with respect to the validation set.",
"help_text": "Type: `boolean`, multiple: `False`. "
},
"early_stopping_monitor": {
"type": "string",
"description": "Metric logged during validation set epoch.",
"help_text": "Type: `string`, multiple: `False`, default: `\"elbo_validation\"`, choices: ``elbo_validation`, `reconstruction_loss_validation`, `kl_local_validation``. ",
"enum": [
"elbo_validation",
"reconstruction_loss_validation",
"kl_local_validation"
],
"default": "elbo_validation"
},
"early_stopping_patience": {
"type": "integer",
"description": "Number of validation epochs with no improvement after which training will be stopped.",
"help_text": "Type: `integer`, multiple: `False`, default: `45`. ",
"default": 45
},
"early_stopping_min_delta": {
"type": "number",
"description": "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement.",
"help_text": "Type: `double`, multiple: `False`, default: `0.0`. ",
"default": 0.0
},
"max_epochs": {
"type": "integer",
"description": "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest.",
"help_text": "Type: `integer`, multiple: `False`. "
},
"reduce_lr_on_plateau": {
"type": "boolean",
"description": "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus.",
"help_text": "Type: `boolean`, multiple: `False`, default: `true`. ",
"default": true
},
"lr_factor": {
"type": "number",
"description": "Factor to reduce learning rate.",
"help_text": "Type: `double`, multiple: `False`, default: `0.6`. ",
"default": 0.6
},
"lr_patience": {
"type": "number",
"description": "Number of epochs with no improvement after which learning rate will be reduced.",
"help_text": "Type: `double`, multiple: `False`, default: `30.0`. ",
"default": 30.0
}
}
},
"nextflow input-output arguments": {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type": "string",
"description": "Path to an output directory.",
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
}
}
}
},
"allOf": [
{
"$ref": "#/$defs/inputs"
},
{
"$ref": "#/$defs/outputs"
},
{
"$ref": "#/$defs/scvi model"
},
{
"$ref": "#/$defs/scanvi training arguments"
},
{
"$ref": "#/$defs/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,24 @@
import anndata as ad
import re
def set_var_index(adata: ad.AnnData, var_name: str | None = None) -> ad.AnnData:
    """Use gene names without a trailing ``.<digits>`` suffix as the `.var` index.

    Every name has a trailing dot followed by one or more digits removed
    (for example ``"GENE.12"`` becomes ``"GENE"``); names without such a
    suffix are kept unchanged.

    Parameters
    ----------
    adata : AnnData
        Annotated data object; its `.var` index is overwritten.
    var_name : str | None
        Column of `adata.var` holding the gene names. When None (or empty),
        the current index is sanitized instead of being replaced by a column.

    Returns
    -------
    AnnData
        The very same `adata` object that was passed in. The index is
        changed in place; no copy is made.
    """
    version_suffix = re.compile("\\.[0-9]+$")
    raw_names = adata.var[var_name] if var_name else adata.var.index
    adata.var.index = [version_suffix.sub("", gene) for gene in raw_names]
    return adata

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to emit INFO-and-above records on stdout.

    The function is idempotent: when the root logger already has a stream
    handler bound to the current ``sys.stdout``, no further handler is added.
    Without this guard every additional call would attach another handler
    and each record would be printed once per call.

    Returns
    -------
    logging.Logger
        The root logger, with its level set to INFO.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger

View File

@@ -0,0 +1,31 @@
def subset_vars(adata, subset_col):
    """Keep only the features flagged in a boolean `.var` column.

    Parameters
    ----------
    adata : AnnData
        Annotated data object
    subset_col : str
        Name of the boolean column in `adata.var` marking the features to keep.
        A column of nullable ``"boolean"`` dtype is accepted as long as it has
        no missing values; it is converted to plain ``bool`` in `adata.var`
        before subsetting.

    Returns
    -------
    AnnData
        Copy of `adata` restricted to the selected features

    Raises
    ------
    ValueError
        If `subset_col` is not a column of `adata.var`.
    AssertionError
        If the column contains missing values or is not boolean.
    """
    column_is_present = subset_col in adata.var.columns
    if not column_is_present:
        raise ValueError(
            f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
        )

    flags = adata.var[subset_col]
    if flags.dtype == "boolean":
        # Nullable booleans can only be turned into plain bools without NA values.
        missing_count = flags.isna().sum()
        assert missing_count == 0, (
            f"The .var column `{subset_col}` contains NaN values. Can not subset data."
        )
        adata.var[subset_col] = flags.astype("bool")

    # Re-read the column: it may just have been replaced above.
    is_plain_bool = adata.var[subset_col].dtype == "bool"
    assert is_plain_bool, (
        f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data."
    )

    selection = adata.var[subset_col]
    subset = adata[:, selection]
    return subset.copy()

View File

@@ -0,0 +1,300 @@
name: "leiden"
namespace: "cluster"
version: "3.0.0"
authors:
- name: "Dries De Maeyer"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "ddemaeyer@gmail.com"
github: "ddemaeyer"
linkedin: "dries-de-maeyer-b46a814"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input file."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsp_connectivities"
description: "In which .obsp slot the neighbor connectivities can be found."
info: null
default:
- "connectivities"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_name"
description: "Name of the .obsm key under which to add the cluster labels.\nThe\
\ name of the columns in the matrix will correspond to the resolutions.\n"
info: null
default:
- "leiden"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--resolution"
description: "A parameter value controlling the coarseness of the clustering.\
\ Higher values lead to more clusters.\nMultiple values will result in clustering\
\ being performed multiple times.\n"
info: null
default:
- 1.0
required: true
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Cluster cells using the [Leiden algorithm] [Traag18] implemented in\
\ the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain\
\ algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15]\
\ [Levine15]. \nThis requires having ran `neighbors/find_neighbors` or `neighbors/bbknn`\
\ first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in\
\ large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven\
\ Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with\
\ Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing\
\ well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale\
\ single-cell gene expression data analysis, Genome Biology. \n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highcpu"
- "midmem"
- "middisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.13-slim"
target_tag: "3.0.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
- "leidenalg~=0.10.0"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/cluster/leiden/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/cluster/leiden"
executable: "target/nextflow/cluster/leiden/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
- ".engines[.type == 'docker'].target_tag := '3.0.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Rewrite an .h5mu file so that its uncompressed datasets are compressed.

    The output file is created with a 512-byte HDF5 userblock. Groups and
    datasets reached by ``visititems`` are recreated in the output together
    with their attributes, and the textual header stored at the start of the
    input file is copied into the output userblock.

    Parameters
    ----------
    input_path
        Path of the existing .h5mu file.
    output_path
        Path of the file to create (opened in "w" mode, so an existing file
        is replaced).
    compression
        Compression filter handed to ``create_dataset`` for every dataset
        that is non-scalar and not already compressed.

    Raises
    ------
    NotImplementedError
        If a visited element is neither a Group nor a Dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        # Mirror every HDF5 attribute of in_object onto out_object.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        item: Union[Group, Dataset],
    ):
        # Callback for visititems: recreate `item` under the same name in output_h5.
        if isinstance(item, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(item, new_group)
        elif isinstance(item, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects dont have a shape defined
            if not item.compression and item.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=item, compression=compression
                )
                copy_attributes(item, new_dataset)
            else:
                # Already compressed or scalar: copy as-is.
                output_h5.copy(item, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(item)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        # visititems does not call the callback for the root group itself,
        # so its attributes are copied separately.
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))
        # Never read beyond the input's own userblock: for an input without a
        # userblock the first bytes are HDF5 data and must not be copied into
        # the output userblock.
        header_size = min(input_h5.userblock_size, userblock_size)

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        starting_metadata = input_bytes.read(header_size)

    # The metadata is padded with extra null bytes up until 512 bytes.
    # partition() keeps the complete header when no null byte is present,
    # whereas slicing on find() == -1 would drop the last byte.
    starting_metadata = starting_metadata.partition(b"\x00")[0]

    with open(output_path, "br+") as f:
        # Overwrite the whole userblock: header first, null padding after.
        f.write(starting_metadata.ljust(userblock_size, b"\0"))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file and write one modality into the copy, optionally compressed.

    Parameters
    ----------
    output_file
        Destination path of the resulting .h5mu file.
    h5mu
        Existing .h5mu file that is copied as the starting point.
    modality_name
        Name of the modality passed to `write_h5ad` as `mod`.
    modality_data
        AnnData object passed to `write_h5ad` as `data`.
    output_compression
        When falsy, the copy is written directly to `output_file`.
        Otherwise the copy is first written to a sibling file named
        `<stem>_uncompressed.h5mu`, compressed into `output_file` with
        `compress_h5mu`, and the intermediate file is removed.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    # The copy happens outside the try block on purpose: when it fails because
    # source and intermediate are the same file, nothing must be deleted.
    shutil.copyfile(h5mu, output_file_uncompressed)
    try:
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Always remove the intermediate file, also when writing or
        # compressing raised, so failed runs do not leave large leftovers.
        output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the cluster/leiden component.
manifest {
name = 'cluster/leiden'
mainScript = 'main.nf'
// The leading '!' asks Nextflow to stop when the running version does not satisfy the constraint.
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15] [Levine15]. \nThis requires having ran `neighbors/find_neighbors` or `neighbors/bbknn` first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale single-cell gene expression data analysis, Genome Biology. \n'
author = 'Dries De Maeyer'
}
// Container image set at the process scope, i.e. the default for processes in this config.
process.container = 'nextflow/bash:latest'
// detect tempdir
// The first of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR that is set (and non-empty) wins;
// '/tmp' is the fallback. The resulting path is converted to an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Optional profiles, selectable on the command line with -profile.
profiles {
// Disables publishDir for every process (the selector '.*' matches all process names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at the tempDir detected above.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each of the profiles below enables exactly one container engine and
// explicitly disables the four others.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
singularity {
singularity.enabled = true
// autoMounts is only switched on for singularity.
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed resource labels: a process carrying one of these labels gets the
// memory (in bytes) or CPU count written next to it.
process{
// Decimal memory sizes: gb = 10^9 bytes, tb = 10^12 bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory sizes: gib = 2^30 bytes, tib = 2^40 bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU counts.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Loads further settings from nextflow_labels.config, resolved relative to this file.
// NOTE(review): presumably the lowmem/midmem/... label definitions shipped with the component - confirm.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
// Resource defaults and label-based resource classes.
process {
// Default resources for components that hardly do any processing
// (the memory request is multiplied by the attempt number).
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 are retried, any other status terminates the run).
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests <base size> * attempt. When both resourceLimits.memory and
// maxRetries are set and the attempt number has reached maxRetries, the closure
// requests resourceLimits.memory instead.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// All four closures are identical: they yield process.disk when it is truthy and null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,12 @@
# Parameter template: complete the entries marked "please fill in".
# Commented-out entries can be uncommented to set a value explicitly.
# Arguments
input: # please fill in - example: "input.h5mu"
modality: "rna"
obsp_connectivities: "connectivities"
# output: "$id.$key.output.h5mu"
obsm_name: "leiden"
resolution: # please fill in - example: [1.0]
# output_compression: "gzip"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"

View File

@@ -0,0 +1,101 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "leiden",
"description": "Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15] [Levine15]. \nThis requires having run `neighbors/find_neighbors` or `neighbors/bbknn` first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing well-connected communities, arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale single-cell gene expression data analysis, Genome Biology. \n",
"type": "object",
"$defs": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"type": "string",
"default": "",
"format": "file-path",
"mimetype": "text/csv"
}
}
},
"arguments": {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type": "string",
"format": "path",
"exists": true,
"description": "Input file.",
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. "
},
"modality": {
"type": "string",
"description": "Which modality from the input MuData file to process.\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ",
"default": "rna"
},
"obsp_connectivities": {
"type": "string",
"description": "In which .obsp slot the neighbor connectivities can be found.",
"help_text": "Type: `string`, multiple: `False`, default: `\"connectivities\"`. ",
"default": "connectivities"
},
"output": {
"type": "string",
"format": "path",
"description": "Output file.",
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ",
"default": "$id.$key.output.h5mu"
},
"obsm_name": {
"type": "string",
"description": "Name of the .obsm key under which to add the cluster labels.\nThe name of the columns in the matrix will correspond to the resolutions.\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"leiden\"`. ",
"default": "leiden"
},
"resolution": {
"type": "array",
"items": {
"type": "number"
},
"description": "A parameter value controlling the coarseness of the clustering",
"help_text": "Type: `double`, multiple: `True`, required, default: `[1.0]`. ",
"default": [
1.0
]
},
"output_compression": {
"type": "string",
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ",
"enum": [
"gzip",
"lzf"
]
}
}
},
"nextflow input-output arguments": {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type": "string",
"description": "Path to an output directory.",
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
}
}
}
},
"allOf": [
{
"$ref": "#/$defs/arguments"
},
{
"$ref": "#/$defs/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """
    Configure the root logger to print INFO-level records to stdout.

    The root logger's level is set to INFO and a stream handler writing to
    ``sys.stdout`` with the format ``"<asctime> <levelname> <message>"`` is
    attached. The function is idempotent: when the handler installed by an
    earlier call is still attached (and still targets the current stdout),
    no second handler is added, so records are not printed twice.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    # Name used to recognise the handler installed by a previous call.
    handler_name = "openpipeline_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    for existing in logger.handlers:
        if (
            existing.get_name() == handler_name
            and getattr(existing, "stream", None) is stdout
        ):
            # Already configured: adding another handler would duplicate output.
            return logger

    console_handler = logging.StreamHandler(stdout)
    console_handler.set_name(handler_name)
    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    console_handler.setFormatter(logFormatter)
    logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,332 @@
name: "concatenate_h5mu"
namespace: "dataflow"
version: "3.0.0"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Paths to the different samples to be concatenated."
info: null
example:
- "sample_paths"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Only output concatenated objects for the provided modalities. Outputs\
\ all modalities by default."
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--input_id"
description: "Names of the different samples that have to be concatenated. Must\
\ be specified when using '--mode move'.\nIn this case, the ids will be used\
\ for the column names of the dataframes registering the conflicts.\nIf specified,\
\ must be of same length as `--input`.\n"
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output location for the concatenated MuData object file.\n"
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_sample_name"
description: "Name of the .obs key under which to add the sample names."
info: null
default:
- "sample_id"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--other_axis_mode"
description: "How to handle the merging of other axis (var, obs, ...).\n\n -\
\ None: keep no data\n - same: only keep elements of the matrices which are\
\ the same in each of the samples\n - unique: only keep elements for which\
\ there is only 1 possible value (1 value that can occur in multiple samples)\n\
\ - first: keep the annotation from the first sample\n - only: keep elements\
\ that show up in only one of the objects (1 unique element in only 1 sample)\n\
\ - move: identical to 'same', but moving the conflicting values to .varm or\
\ .obsm\n"
info: null
default:
- "move"
required: false
choices:
- "same"
- "unique"
- "first"
- "only"
- "concat"
- "move"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--uns_merge_mode"
description: "How to handle the merging of .uns across modalities\n - None: keep\
\ no data\n - same: only keep elements of the matrices which are the same in\
\ each of the samples\n - unique: only keep elements for which there is only\
\ 1 possible value (1 value that can occur in multiple samples)\n - first:\
\ keep the annotation from the first sample\n - only: keep elements that show\
\ up in only one of the objects (1 unique element in only 1 sample)\n - make_unique:\
\ identical to 'unique', but keys which are not unique are made unique by prefixing\
\ them with the sample id.\n"
info: null
default:
- "make_unique"
required: false
choices:
- "same"
- "unique"
- "first"
- "only"
- "make_unique"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Concatenate observations from samples in several (uni- and/or multi-modal)\
\ MuData files into a single file.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
- type: "file"
path: "human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "midcpu"
- "highmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11-slim"
target_tag: "3.0.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "pandas~=2.1.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/dataflow/concatenate_h5mu/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/dataflow/concatenate_h5mu"
executable: "target/nextflow/dataflow/concatenate_h5mu/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
- ".engines[.type == 'docker'].target_tag := '3.0.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """
    Copy an .h5mu file to a new file, compressing its datasets on the way.

    Every group and dataset found in ``input_path`` is recreated in
    ``output_path`` together with its HDF5 attributes. Non-scalar datasets
    that are not compressed yet are rewritten using ``compression``; all
    other datasets are copied as they are. Afterwards the textual header at
    the start of the input file is replicated into the 512-byte user block
    of the output file.

    Args:
        input_path: location of the existing .h5mu file.
        output_path: location of the file to create (opened in "w" mode).
        compression: compression filter passed to h5py ("gzip" or "lzf").

    Raises:
        NotImplementedError: when the input holds an object that is neither
            an h5py Group nor an h5py Dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the user block that precedes the HDF5 data in the output file.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        # Mirror all HDF5 attributes from one object onto another.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        # Callback for visititems(): recreate one input object in the output.
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects dont have a shape defined
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole block so that headers longer than 100 bytes survive.
        starting_metadata = input_bytes.read(userblock_size)

    # The metadata is padded with extra null bytes up until 512 bytes
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        # find() returns -1 when there is no padding; slicing with -1 would
        # silently drop the last header byte, so only truncate on a real hit.
        starting_metadata = starting_metadata[:truncate_location]

    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """
    Copy an .h5mu file and overwrite one modality in the copy.

    ``h5mu`` is copied, ``modality_data`` is written into the copy under
    ``modality_name`` and the result ends up at ``output_file``. When
    ``output_compression`` is set, the copy is first made next to
    ``output_file`` as ``<stem>_uncompressed.h5mu``, then compressed into
    ``output_file`` with ``compress_h5mu``. That scratch file is removed
    afterwards, also when writing or compressing fails.

    Args:
        output_file: destination path of the resulting .h5mu file.
        h5mu: path of the .h5mu file used as the starting point.
        modality_name: name of the modality to write.
        modality_data: AnnData object holding the modality's data.
        output_compression: compression passed to ``compress_h5mu``;
            a falsy value (the default) writes the output uncompressed.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        # No compression requested: work directly on the final destination.
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    try:
        shutil.copyfile(h5mu, output_file_uncompressed)
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Never leave the (potentially large) scratch copy behind on failure.
        output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the dataflow/concatenate_h5mu component.
manifest {
name = 'dataflow/concatenate_h5mu'
mainScript = 'main.nf'
// The leading '!' makes Nextflow stop when its version does not satisfy the constraint.
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'Concatenate observations from samples in several (uni- and/or multi-modal) MuData files into a single file.\n'
author = 'Dries Schaumont'
}
// Container image assigned at process scope.
process.container = 'nextflow/bash:latest'
// detect tempdir
// Uses the first of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR that is set to a
// non-empty value, falling back to '/tmp'; the result is made absolute.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Named configuration profiles.
profiles {
// Disable publishDir for every process (the '.*' selector matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Point the temp setting of docker, podman and charliecloud at tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each profile below enables exactly one container engine and explicitly
// disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed-size resource labels: each label maps to a constant memory or cpu request.
process{
// Decimal memory labels (gb / tb): multiples of 1000^3 and 1000^4 bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory labels (gib / tib): multiples of 1024^3 and 1024^4 bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU labels: the number in the label name is the requested cpu count.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Load the additional settings stored in the file nextflow_labels.config.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
// Process-scope defaults plus the withLabel selectors (cpu / memory / disk)
// that individual components can opt into.
process {
// Default resources for components that hardly do any processing
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140); every other failure uses 'terminate'.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt.
// Uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests <base size> * task.attempt. When a memory resourceLimit
// is configured and task.attempt has reached task.maxRetries, the task
// requests that limit instead of the scaled base size.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// Every disk label below evaluates to process.disk when that is set, else null.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,13 @@
# Parameter template: complete the entries marked "please fill in".
# Commented-out entries can be uncommented to set a value explicitly.
# Arguments
input: # please fill in - example: ["sample_paths"]
# modality: ["foo"]
# input_id: ["foo"]
# output: "$id.$key.output.h5mu"
obs_sample_name: "sample_id"
other_axis_mode: "move"
uns_merge_mode: "make_unique"
# output_compression: "gzip"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"

View File

@@ -0,0 +1,124 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "concatenate_h5mu",
"description": "Concatenate observations from samples in several (uni- and/or multi-modal) MuData files into a single file.\n",
"type": "object",
"$defs": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"type": "string",
"default": "",
"format": "file-path",
"mimetype": "text/csv"
}
}
},
"arguments": {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type": "array",
"items": {
"type": "string"
},
"format": "path",
"exists": true,
"description": "Paths to the different samples to be concatenated.",
"help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"sample_paths\"]`. "
},
"modality": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only output concatenated objects for the provided modalities",
"help_text": "Type: `string`, multiple: `True`. "
},
"input_id": {
"type": "array",
"items": {
"type": "string"
},
"description": "Names of the different samples that have to be concatenated",
"help_text": "Type: `string`, multiple: `True`. "
},
"output": {
"type": "string",
"format": "path",
"description": "Output location for the concatenated MuData object file.\n",
"help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ",
"default": "$id.$key.output.h5mu"
},
"obs_sample_name": {
"type": "string",
"description": "Name of the .obs key under which to add the sample names.",
"help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ",
"default": "sample_id"
},
"other_axis_mode": {
"type": "string",
"description": "How to handle the merging of other axis (var, obs, ...).\n\n - None: keep no data\n - same: only keep elements of the matrices which are the same in each of the samples\n - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples)\n - first: keep the annotation from the first sample\n - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample)\n - move: identical to 'same', but moving the conflicting values to .varm or .obsm\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"move\"`, choices: ``same`, `unique`, `first`, `only`, `concat`, `move``. ",
"enum": [
"same",
"unique",
"first",
"only",
"concat",
"move"
],
"default": "move"
},
"uns_merge_mode": {
"type": "string",
"description": "How to handle the merging of .uns across modalities\n - None: keep no data\n - same: only keep elements of the matrices which are the same in each of the samples\n - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples)\n - first: keep the annotation from the first sample\n - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample)\n - make_unique: identical to 'unique', but keys which are not unique are made unique by prefixing them with the sample id.\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"make_unique\"`, choices: ``same`, `unique`, `first`, `only`, `make_unique``. ",
"enum": [
"same",
"unique",
"first",
"only",
"make_unique"
],
"default": "make_unique"
},
"output_compression": {
"type": "string",
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ",
"enum": [
"gzip",
"lzf"
]
}
}
},
"nextflow input-output arguments": {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type": "string",
"description": "Path to an output directory.",
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
}
}
}
},
"allOf": [
{
"$ref": "#/$defs/arguments"
},
{
"$ref": "#/$defs/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """
    Configure the root logger to print INFO-level records to stdout.

    The root logger's level is set to INFO and a stream handler writing to
    ``sys.stdout`` with the format ``"<asctime> <levelname> <message>"`` is
    attached. The function is idempotent: when the handler installed by an
    earlier call is still attached (and still targets the current stdout),
    no second handler is added, so records are not printed twice.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    # Name used to recognise the handler installed by a previous call.
    handler_name = "openpipeline_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    for existing in logger.handlers:
        if (
            existing.get_name() == handler_name
            and getattr(existing, "stream", None) is stdout
        ):
            # Already configured: adding another handler would duplicate output.
            return logger

    console_handler = logging.StreamHandler(stdout)
    console_handler.set_name(handler_name)
    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    console_handler.setFormatter(logFormatter)
    logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,246 @@
name: "merge"
namespace: "dataflow"
version: "3.0.0"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Paths to the single-modality .h5mu files that need to be combined"
info: null
default:
- "sample_paths"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: true
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Path to the output file."
info: null
default:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "The compression format to be used on the output h5mu object."
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Combine one or more single-modality .h5mu files together into one .h5mu\
\ file.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu"
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "highmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_tag: "3.0.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/dataflow/merge/config.vsh.yml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/dataflow/merge"
executable: "target/nextflow/dataflow/merge/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
- ".engines[.type == 'docker'].target_tag := '3.0.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the dataflow/merge component.
manifest {
name = 'dataflow/merge'
// Entry-point script of this component.
mainScript = 'main.nf'
// The leading '!' turns the version constraint into a hard requirement:
// Nextflow stops instead of only warning when the running version does not match.
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'Combine one or more single-modality .h5mu files together into one .h5mu file.\n'
author = 'Dries Schaumont'
}
// Container image used by processes that do not specify their own.
process.container = 'nextflow/bash:latest'
// detect tempdir
// Takes the first of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR that has a
// non-empty value, falls back to '/tmp', and converts the result to an
// absolute path. The value is consumed by the 'mount_temp' profile.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Optional configuration profiles.
profiles {
// Disables publishDir for every process (the '.*' name selector matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at the detected tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each of the profiles below enables exactly one container engine and
// explicitly disables the other ones.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
singularity {
singularity.enabled = true
// Singularity additionally gets autoMounts switched on.
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Resource labels: a process carrying one of these labels gets the matching
// fixed memory or CPU request.
process{
// Decimal memory labels (memNgb / memNtb): N * 10^9 or N * 10^12 bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory labels (memNgib / memNtib): N * 2^30 or N * 2^40 bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU labels (cpuN): N cpus.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Load the additional label definitions shipped as nextflow_labels.config.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
// Default process resources and the resource labels (cpu / memory / disk)
// that components refer to.
process {
// Default resources for components that hardly do any processing
// (the memory request is 2 GB multiplied by the attempt number)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit status 137 up to and including 140); any other status terminates the run.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests its base amount multiplied by the attempt number.
// When both resourceLimits.memory and maxRetries are set and the attempt
// number has reached maxRetries, resourceLimits.memory is requested instead.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// Every disk label passes process.disk through when it is set and yields null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,8 @@
# Arguments
input: # please fill in - example: ["sample_paths"]
# output: "output.h5mu"
# output_compression: "gzip"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"

View File

@@ -0,0 +1,78 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "merge",
"description": "Combine one or more single-modality .h5mu files together into one .h5mu file.\n",
"type": "object",
"$defs": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"type": "string",
"default": "",
"format": "file-path",
"mimetype": "text/csv"
}
}
},
"arguments": {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type": "array",
"items": {
"type": "string"
},
"format": "path",
"exists": true,
"description": "Paths to the single-modality .h5mu files that need to be combined",
"help_text": "Type: `file`, multiple: `True`, required, default: `[\"sample_paths\"]`, direction: `input`. ",
"default": [
"sample_paths"
]
},
"output": {
"type": "string",
"format": "path",
"description": "Path to the output file.",
"help_text": "Type: `file`, multiple: `False`, default: `\"output.h5mu\"`, direction: `output`. ",
"default": "output.h5mu"
},
"output_compression": {
"type": "string",
"description": "The compression format to be used on the output h5mu object.",
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: `gzip`, `lzf`. ",
"enum": [
"gzip",
"lzf"
]
}
}
},
"nextflow input-output arguments": {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type": "string",
"description": "Path to an output directory.",
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
}
}
}
},
"allOf": [
{
"$ref": "#/$defs/arguments"
},
{
"$ref": "#/$defs/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level messages to stdout.

    The root logger's level is set to ``logging.INFO`` and a stream handler
    writing to the current ``sys.stdout`` is attached, using the format
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: calling it repeatedly does not stack up
    handlers (which would print every message several times). If a handler
    installed by an earlier call points to a different stream than the
    current ``sys.stdout``, it is replaced.

    Returns:
        logging.Logger: the configured root logger.
    """
    import logging
    from sys import stdout

    # Name used to recognise the handler installed by a previous call.
    handler_name = "setup_logger_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Iterate over a copy because stale handlers are removed while looping.
    for existing in list(logger.handlers):
        if existing.get_name() != handler_name:
            continue
        if getattr(existing, "stream", None) is stdout:
            # Already configured for the current stdout; nothing to add.
            return logger
        # stdout was swapped since the previous call; drop the stale handler.
        logger.removeHandler(existing)

    console_handler = logging.StreamHandler(stdout)
    console_handler.set_name(handler_name)
    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    console_handler.setFormatter(logFormatter)
    logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,273 @@
name: "split_modalities"
namespace: "dataflow"
version: "3.0.0"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Robrecht Cannoodt"
roles:
- "contributor"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Path to a single .h5mu file."
info: null
default:
- "sample_path"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output directory containing multiple h5mu files."
info: null
example:
- "/path/to/output"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output_types"
description: "A csv containing the base filename and modality type per output\
\ file."
info: null
example:
- "types.csv"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Split the modalities from a single .h5mu multimodal sample into separate\
\ .h5mu files. \n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "lowmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_tag: "3.0.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/dataflow/split_modalities/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/dataflow/split_modalities"
executable: "target/nextflow/dataflow/split_modalities/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
- ".engines[.type == 'docker'].target_tag := '3.0.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the dataflow/split_modalities component.
manifest {
name = 'dataflow/split_modalities'
// Entry-point script of this component.
mainScript = 'main.nf'
// The leading '!' turns the version constraint into a hard requirement:
// Nextflow stops instead of only warning when the running version does not match.
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'Split the modalities from a single .h5mu multimodal sample into seperate .h5mu files. \n'
author = 'Dries Schaumont, Robrecht Cannoodt'
}
// Container image used by processes that do not specify their own.
process.container = 'nextflow/bash:latest'
// detect tempdir
// Takes the first of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR that has a
// non-empty value, falls back to '/tmp', and converts the result to an
// absolute path. The value is consumed by the 'mount_temp' profile.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Optional configuration profiles.
profiles {
// Disables publishDir for every process (the '.*' name selector matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at the detected tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each of the profiles below enables exactly one container engine and
// explicitly disables the other ones.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
singularity {
singularity.enabled = true
// Singularity additionally gets autoMounts switched on.
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Resource labels: a process carrying one of these labels gets the matching
// fixed memory or CPU request.
process{
// Decimal memory labels (memNgb / memNtb): N * 10^9 or N * 10^12 bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory labels (memNgib / memNtib): N * 2^30 or N * 2^40 bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU labels (cpuN): N cpus.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Load the additional label definitions shipped as nextflow_labels.config.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
// Default process resources and the resource labels (cpu / memory / disk)
// that components refer to.
process {
// Default resources for components that hardly do any processing
// (the memory request is 2 GB multiplied by the attempt number)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit status 137 up to and including 140); any other status terminates the run.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt
// uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests its base amount multiplied by the attempt number.
// When both resourceLimits.memory and maxRetries are set and the attempt
// number has reached maxRetries, resourceLimits.memory is requested instead.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// Every disk label passes process.disk through when it is set and yields null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,9 @@
# Arguments
input: # please fill in - example: "sample_path"
# output: "$id.$key.output"
# output_types: "$id.$key.output_types.csv"
# output_compression: "gzip"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"

View File

@@ -0,0 +1,80 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "split_modalities",
"description": "Split the modalities from a single .h5mu multimodal sample into separate .h5mu files. \n",
"type": "object",
"$defs": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"type": "string",
"default": "",
"format": "file-path",
"mimetype": "text/csv"
}
}
},
"arguments": {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type": "string",
"format": "path",
"exists": true,
"description": "Path to a single .h5mu file.",
"help_text": "Type: `file`, multiple: `False`, required, default: `\"sample_path\"`, direction: `input`. ",
"default": "sample_path"
},
"output": {
"type": "string",
"format": "path",
"description": "Output directory containing multiple h5mu files.",
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/output\"`. ",
"default": "$id.$key.output"
},
"output_types": {
"type": "string",
"format": "path",
"description": "A csv containing the base filename and modality type per output file.",
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_types.csv\"`, direction: `output`, example: `\"types.csv\"`. ",
"default": "$id.$key.output_types.csv"
},
"output_compression": {
"type": "string",
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: `gzip`, `lzf`. ",
"enum": [
"gzip",
"lzf"
]
}
}
},
"nextflow input-output arguments": {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type": "string",
"description": "Path to an output directory.",
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
}
}
}
},
"allOf": [
{
"$ref": "#/$defs/arguments"
},
{
"$ref": "#/$defs/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level messages to stdout.

    The root logger's level is set to ``logging.INFO`` and a stream handler
    writing to the current ``sys.stdout`` is attached, using the format
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: calling it repeatedly does not stack up
    handlers (which would print every message several times). If a handler
    installed by an earlier call points to a different stream than the
    current ``sys.stdout``, it is replaced.

    Returns:
        logging.Logger: the configured root logger.
    """
    import logging
    from sys import stdout

    # Name used to recognise the handler installed by a previous call.
    handler_name = "setup_logger_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Iterate over a copy because stale handlers are removed while looping.
    for existing in list(logger.handlers):
        if existing.get_name() != handler_name:
            continue
        if getattr(existing, "stream", None) is stdout:
            # Already configured for the current stdout; nothing to add.
            return logger
        # stdout was swapped since the previous call; drop the stale handler.
        logger.removeHandler(existing)

    console_handler = logging.StreamHandler(stdout)
    console_handler.set_name(handler_name)
    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    console_handler.setFormatter(logFormatter)
    logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,318 @@
name: "pca"
namespace: "dimred"
version: "3.0.0"
authors:
- name: "Dries De Maeyer"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "ddemaeyer@gmail.com"
github: "ddemaeyer"
linkedin: "dries-de-maeyer-b46a814"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Input h5mu file"
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer"
description: "Use specified layer for expression values instead of the .X object\
\ from the modality."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_input"
description: "Column name in .var matrix that will be used to select which genes\
\ to run the PCA on."
info: null
example:
- "filter_with_hvg"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_output"
description: "In which .obsm slot to store the resulting embedding."
info: null
default:
- "X_pca"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--varm_output"
description: "In which .varm slot to store the resulting loadings matrix."
info: null
default:
- "pca_loadings"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--uns_output"
description: "In which .uns slot to store the resulting variance objects."
info: null
default:
- "pca_variance"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--num_components"
description: "Number of principal components to compute. Defaults to 50, or 1\
\ - minimum dimension size of selected representation."
info: null
example:
- 25
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--overwrite"
description: "Allow overwriting .obsm, .varm and .uns slots."
info: null
direction: "input"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Computes PCA coordinates, loadings and variance decomposition. Uses\
\ the implementation of scikit-learn [Pedregosa11].\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highcpu"
- "highmem"
- "middisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_tag: "3.0.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/dimred/pca/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/dimred/pca"
executable: "target/nextflow/dimred/pca/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
- ".engines[.type == 'docker'].target_tag := '3.0.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Copy an .h5mu file to ``output_path``, compressing its datasets.

    The HDF5 hierarchy of ``input_path`` is walked item by item: groups are
    recreated, non-scalar datasets that are not yet compressed are rewritten
    with ``compression``, and every other dataset is copied as-is. Attributes
    are carried over for the root, the groups and the rewritten datasets.
    Finally the textual header found at the very start of the input file is
    copied into the 512-byte user block of the output file.

    Args:
        input_path: Path of the existing .h5mu file to read.
        output_path: Path of the file to create (opened in "w" mode).
        compression: Compression filter handed to ``create_dataset``.

    Raises:
        NotImplementedError: If the input contains an HDF5 object that is
            neither a group nor a dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the user block reserved in front of the HDF5 data in the output.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        """Copy every HDF5 attribute of ``in_object`` onto ``out_object``."""
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        """``visititems`` callback: replicate one input item in ``output_h5``."""
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects.
            # Scalar objects don't have a shape defined.
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                # Already compressed or scalar: copy the dataset unchanged.
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole user block so that long headers are not cut short.
        starting_metadata = input_bytes.read(userblock_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # Only truncate when a null byte is actually present: find() returns -1
    # otherwise, and slicing with -1 would silently drop the last byte.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]

    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        # Pad the remainder of the user block with null bytes.
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Write one modality into a copy of an existing .h5mu file.

    ``h5mu`` is copied, ``modality_data`` is written into the copy under
    ``modality_name`` with ``mudata.write_h5ad``, and, when
    ``output_compression`` is set, the result is passed through
    ``compress_h5mu`` to produce ``output_file``.

    Args:
        output_file: Destination path of the resulting .h5mu file.
        h5mu: Path of the .h5mu file used as the starting point.
        modality_name: Name of the modality to write.
        modality_data: AnnData object holding the modality's data.
        output_compression: Compression passed to ``compress_h5mu``; when
            falsy the file is written directly without a compression pass.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        # No compression requested: work on the final destination directly.
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    # Compression needs an intermediate, uncompressed file next to the output.
    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    try:
        shutil.copyfile(h5mu, output_file_uncompressed)
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Always remove the intermediate file, also when a step above raised,
        # so that a failed run does not leave a large temporary file behind.
        output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
manifest {
name = 'dimred/pca'
mainScript = 'main.nf'
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11].\n'
author = 'Dries De Maeyer'
}
process.container = 'nextflow/bash:latest'
// detect tempdir
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
profiles {
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
process{
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
process {
// Process-wide resource defaults plus label-based overrides for CPU, memory and disk.
// Default resources for components that hardly do any processing
// The memory closure multiplies 2 GB by the current attempt number.
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140); every other exit status selects 'terminate'.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt.
// Uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests its base amount multiplied by the attempt number.
// When both a resourceLimits memory value and maxRetries are set and the
// attempt number has reached maxRetries, the closure returns the
// resourceLimits memory value instead.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// Each closure below passes process.disk through when it is set and yields null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,16 @@
# Arguments
input: # please fill in - example: "input.h5mu"
modality: "rna"
# layer: "foo"
# var_input: "filter_with_hvg"
# output: "$id.$key.output.h5mu"
obsm_output: "X_pca"
varm_output: "pca_loadings"
uns_output: "pca_variance"
# num_components: 25
overwrite: false
# output_compression: "gzip"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"

View File

@@ -0,0 +1,117 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "pca",
"description": "Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11].\n",
"type": "object",
"$defs": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"type": "string",
"default": "",
"format": "file-path",
"mimetype": "text/csv"
}
}
},
"arguments": {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type": "string",
"format": "path",
"exists": true,
"description": "Input h5mu file",
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. "
},
"modality": {
"type": "string",
"description": "Which modality from the input MuData file to process.\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ",
"default": "rna"
},
"layer": {
"type": "string",
"description": "Use specified layer for expression values instead of the .X object from the modality.",
"help_text": "Type: `string`, multiple: `False`. "
},
"var_input": {
"type": "string",
"description": "Column name in .var matrix that will be used to select which genes to run the PCA on.",
"help_text": "Type: `string`, multiple: `False`, example: `\"filter_with_hvg\"`. "
},
"output": {
"type": "string",
"format": "path",
"description": "Output h5mu file.",
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ",
"default": "$id.$key.output.h5mu"
},
"obsm_output": {
"type": "string",
"description": "In which .obsm slot to store the resulting embedding.",
"help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ",
"default": "X_pca"
},
"varm_output": {
"type": "string",
"description": "In which .varm slot to store the resulting loadings matrix.",
"help_text": "Type: `string`, multiple: `False`, default: `\"pca_loadings\"`. ",
"default": "pca_loadings"
},
"uns_output": {
"type": "string",
"description": "In which .uns slot to store the resulting variance objects.",
"help_text": "Type: `string`, multiple: `False`, default: `\"pca_variance\"`. ",
"default": "pca_variance"
},
"num_components": {
"type": "integer",
"description": "Number of principal components to compute",
"help_text": "Type: `integer`, multiple: `False`, example: `25`. "
},
"overwrite": {
"type": "boolean",
"description": "Allow overwriting .obsm, .varm and .uns slots.",
"help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ",
"default": false
},
"output_compression": {
"type": "string",
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ",
"enum": [
"gzip",
"lzf"
]
}
}
},
"nextflow input-output arguments": {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type": "string",
"description": "Path to an output directory.",
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
}
}
}
},
"allOf": [
{
"$ref": "#/$defs/arguments"
},
{
"$ref": "#/$defs/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to log INFO and above to stdout.

    The root logger's level is set to INFO and a stream handler writing to
    ``sys.stdout`` with the format ``"%(asctime)s %(levelname)-8s %(message)s"``
    is attached. The call is idempotent: when the root logger already has a
    stream handler bound to the current ``sys.stdout``, no second handler is
    added, so calling this function repeatedly does not duplicate log lines.

    Returns:
        The root ``logging.Logger`` instance.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Look for a console handler left by an earlier call to avoid duplicates.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger

View File

@@ -0,0 +1,372 @@
name: "umap"
namespace: "dimred"
version: "3.0.0"
authors:
- name: "Dries De Maeyer"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "ddemaeyer@gmail.com"
github: "ddemaeyer"
linkedin: "dries-de-maeyer-b46a814"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
description: "Input h5mu file"
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--uns_neighbors"
description: "The `.uns` neighbors slot as output by the `find_neighbors` component."
info: null
default:
- "neighbors"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obsm_output"
description: "The pre/postfix under which to store the UMAP results."
info: null
default:
- "umap"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Arguments"
arguments:
- type: "double"
name: "--min_dist"
description: "The effective minimum distance between embedded points. Smaller\
\ values will result in a more clustered/clumped embedding where nearby points\
\ on the manifold are drawn closer together, while larger values will result\
\ in a more even dispersal of points. The value should be set relative to the\
\ spread value, which determines the scale at which embedded points will be\
\ spread out."
info: null
default:
- 0.5
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--spread"
description: "The effective scale of embedded points. In combination with `min_dist`\
\ this determines how clustered/clumped the embedded points are."
info: null
default:
- 1.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--num_components"
description: "The number of dimensions of the embedding."
info: null
default:
- 2
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_iter"
description: "The number of iterations (epochs) of the optimization. Called `n_epochs`\
\ in the original UMAP. Default is set to 500 if neighbors['connectivities'].shape[0]\
\ <= 10000, else 200."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--alpha"
description: "The initial learning rate for the embedding optimization."
info: null
default:
- 1.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--gamma"
description: "Weighting applied to negative samples in low dimensional embedding\
\ optimization. Values higher than one will result in greater weight being given\
\ to negative samples."
info: null
default:
- 1.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--negative_sample_rate"
description: "The number of negative edge/1-simplex samples to use per positive\
\ edge/1-simplex sample in optimizing the low dimensional embedding."
info: null
default:
- 5
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--init_pos"
description: "How to initialize the low dimensional embedding. Called `init` in\
\ the original UMAP. Options are:\n \n* Any key from `.obsm`\n* `'paga'`: positions\
\ from `paga()`\n* `'spectral'`: use a spectral embedding of the graph\n* `'random'`:\
\ assign initial embedding positions at random.\n"
info: null
default:
- "spectral"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "UMAP (Uniform Manifold Approximation and Projection) is a manifold learning\
\ technique suitable for visualizing high-dimensional data. Besides tending to be\
\ faster than tSNE, it optimizes the embedding such that it best reflects the topology\
\ of the data, which we represent throughout Scanpy using a neighborhood graph.\
\ tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in\
\ the embedding such that these best match the distribution of distances in the\
\ high-dimensional space. We use the implementation of umap-learn [McInnes18]. For\
\ a few comparisons of UMAP with tSNE, see this preprint.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "highcpu"
- "midmem"
- "middisk"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12-slim"
target_tag: "3.0.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/dimred/umap/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/dimred/umap"
executable: "target/nextflow/dimred/umap/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
- ".engines[.type == 'docker'].target_tag := '3.0.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Copy an .h5mu file to ``output_path``, compressing its datasets.

    The HDF5 hierarchy of ``input_path`` is walked item by item: groups are
    recreated, non-scalar datasets that are not yet compressed are rewritten
    with ``compression``, and every other dataset is copied as-is. Attributes
    are carried over for the root, the groups and the rewritten datasets.
    Finally the textual header found at the very start of the input file is
    copied into the 512-byte user block of the output file.

    Args:
        input_path: Path of the existing .h5mu file to read.
        output_path: Path of the file to create (opened in "w" mode).
        compression: Compression filter handed to ``create_dataset``.

    Raises:
        NotImplementedError: If the input contains an HDF5 object that is
            neither a group nor a dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the user block reserved in front of the HDF5 data in the output.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        """Copy every HDF5 attribute of ``in_object`` onto ``out_object``."""
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        """``visititems`` callback: replicate one input item in ``output_h5``."""
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects.
            # Scalar objects don't have a shape defined.
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                # Already compressed or scalar: copy the dataset unchanged.
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole user block so that long headers are not cut short.
        starting_metadata = input_bytes.read(userblock_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # Only truncate when a null byte is actually present: find() returns -1
    # otherwise, and slicing with -1 would silently drop the last byte.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]

    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        # Pad the remainder of the user block with null bytes.
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Write one modality into a copy of an existing .h5mu file.

    ``h5mu`` is copied, ``modality_data`` is written into the copy under
    ``modality_name`` with ``mudata.write_h5ad``, and, when
    ``output_compression`` is set, the result is passed through
    ``compress_h5mu`` to produce ``output_file``.

    Args:
        output_file: Destination path of the resulting .h5mu file.
        h5mu: Path of the .h5mu file used as the starting point.
        modality_name: Name of the modality to write.
        modality_data: AnnData object holding the modality's data.
        output_compression: Compression passed to ``compress_h5mu``; when
            falsy the file is written directly without a compression pass.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        # No compression requested: work on the final destination directly.
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    # Compression needs an intermediate, uncompressed file next to the output.
    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    try:
        shutil.copyfile(h5mu, output_file_uncompressed)
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Always remove the intermediate file, also when a step above raised,
        # so that a failed run does not leave a large temporary file behind.
        output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
manifest {
name = 'dimred/umap'
mainScript = 'main.nf'
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'UMAP (Uniform Manifold Approximation and Projection) is a manifold learning technique suitable for visualizing high-dimensional data. Besides tending to be faster than tSNE, it optimizes the embedding such that it best reflects the topology of the data, which we represent throughout Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in the embedding such that these best match the distribution of distances in the high-dimensional space. We use the implementation of umap-learn [McInnes18]. For a few comparisons of UMAP with tSNE, see this preprint.\n'
author = 'Dries De Maeyer'
}
process.container = 'nextflow/bash:latest'
// detect tempdir
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
profiles {
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
process{
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
process {
// Process-wide resource defaults plus label-based overrides for CPU, memory and disk.
// Default resources for components that hardly do any processing
// The memory closure multiplies 2 GB by the current attempt number.
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140); every other exit status selects 'terminate'.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt.
// Uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.Gb ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests its base amount multiplied by the attempt number.
// When both a resourceLimits memory value and maxRetries are set and the
// attempt number has reached maxRetries, the closure returns the
// resourceLimits memory value instead.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// Each closure below passes process.disk through when it is set and yields null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

View File

@@ -0,0 +1,23 @@
# Inputs
input: # please fill in - example: "input.h5mu"
modality: "rna"
uns_neighbors: "neighbors"
# Outputs
# output: "$id.$key.output.h5mu"
obsm_output: "umap"
# output_compression: "gzip"
# Arguments
min_dist: 0.5
spread: 1.0
num_components: 2
# max_iter: 123
alpha: 1.0
gamma: 1.0
negative_sample_rate: 5
init_pos: "spectral"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"

View File

@@ -0,0 +1,157 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "umap",
"description": "UMAP (Uniform Manifold Approximation and Projection) is a manifold learning technique suitable for visualizing high-dimensional data. Besides tending to be faster than tSNE, it optimizes the embedding such that it best reflects the topology of the data, which we represent throughout Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in the embedding such that these best match the distribution of distances in the high-dimensional space. We use the implementation of umap-learn [McInnes18]. For a few comparisons of UMAP with tSNE, see this preprint.\n",
"type": "object",
"$defs": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"type": "string",
"default": "",
"format": "file-path",
"mimetype": "text/csv"
}
}
},
"inputs": {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type": "string",
"format": "path",
"exists": true,
"description": "Input h5mu file",
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. "
},
"modality": {
"type": "string",
"description": "Which modality from the input MuData file to process.\n",
"help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ",
"default": "rna"
},
"uns_neighbors": {
"type": "string",
"description": "The `.uns` neighbors slot as output by the `find_neighbors` component.",
"help_text": "Type: `string`, multiple: `False`, default: `\"neighbors\"`. ",
"default": "neighbors"
}
}
},
"outputs": {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"output": {
"type": "string",
"format": "path",
"description": "Output h5mu file.",
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ",
"default": "$id.$key.output.h5mu"
},
"obsm_output": {
"type": "string",
"description": "The pre/postfix under which to store the UMAP results.",
"help_text": "Type: `string`, multiple: `False`, default: `\"umap\"`. ",
"default": "umap"
},
"output_compression": {
"type": "string",
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: `gzip`, `lzf`. ",
"enum": [
"gzip",
"lzf"
]
}
}
},
"arguments": {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"min_dist": {
"type": "number",
"description": "The effective minimum distance between embedded points",
"help_text": "Type: `double`, multiple: `False`, default: `0.5`. ",
"default": 0.5
},
"spread": {
"type": "number",
"description": "The effective scale of embedded points",
"help_text": "Type: `double`, multiple: `False`, default: `1.0`. ",
"default": 1.0
},
"num_components": {
"type": "integer",
"description": "The number of dimensions of the embedding.",
"help_text": "Type: `integer`, multiple: `False`, default: `2`. ",
"default": 2
},
"max_iter": {
"type": "integer",
"description": "The number of iterations (epochs) of the optimization",
"help_text": "Type: `integer`, multiple: `False`. "
},
"alpha": {
"type": "number",
"description": "The initial learning rate for the embedding optimization.",
"help_text": "Type: `double`, multiple: `False`, default: `1.0`. ",
"default": 1.0
},
"gamma": {
"type": "number",
"description": "Weighting applied to negative samples in low dimensional embedding optimization",
"help_text": "Type: `double`, multiple: `False`, default: `1.0`. ",
"default": 1.0
},
"negative_sample_rate": {
"type": "integer",
"description": "The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.",
"help_text": "Type: `integer`, multiple: `False`, default: `5`. ",
"default": 5
},
"init_pos": {
"type": "string",
"description": "How to initialize the low dimensional embedding",
"help_text": "Type: `string`, multiple: `False`, default: `\"spectral\"`. ",
"default": "spectral"
}
}
},
"nextflow input-output arguments": {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type": "string",
"description": "Path to an output directory.",
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
}
}
}
},
"allOf": [
{
"$ref": "#/$defs/inputs"
},
{
"$ref": "#/$defs/outputs"
},
{
"$ref": "#/$defs/arguments"
},
{
"$ref": "#/$defs/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to emit INFO-and-above records on stdout.

    Sets the root logger's level to ``logging.INFO`` and attaches a
    ``logging.StreamHandler`` bound to ``sys.stdout`` that formats records as
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: the handler it installs is given a fixed name,
    and a later call that finds that handler already attached to the current
    ``sys.stdout`` does not add another one. Without this guard every extra
    call would make each log record appear one more time.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    # Name used to recognise the handler installed by a previous call.
    handler_name = "setup_logger_stdout"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    already_attached = any(
        handler.get_name() == handler_name
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.set_name(handler_name)
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        )
        logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,423 @@
name: "highly_variable_features_scanpy"
namespace: "feature_annotation"
version: "3.0.0"
authors:
- name: "Dries De Maeyer"
roles:
- "contributor"
info:
role: "Core Team Member"
links:
email: "ddemaeyer@gmail.com"
github: "ddemaeyer"
linkedin: "dries-de-maeyer-b46a814"
organizations:
- name: "Janssen Pharmaceuticals"
href: "https://www.janssen.com"
role: "Principal Scientist"
- name: "Robrecht Cannoodt"
roles:
- "maintainer"
- "contributor"
info:
role: "Core Team Member"
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Arguments"
arguments:
- type: "file"
name: "--input"
description: "Input h5mu file"
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality from the input MuData file to process.\n"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer"
description: "use adata.layers[layer] for expression values instead of adata.X."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_input"
description: "If specified, use boolean array in adata.var[var_input] to calculate\
\ hvg on subset of vars.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_name_filter"
description: "In which .var slot to store a boolean array corresponding to which\
\ observations should be filtered out."
info: null
default:
- "filter_with_hvg"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--varm_name"
description: "In which .varm slot to store additional metadata."
info: null
default:
- "hvg"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--flavor"
description: "Choose the flavor for identifying highly variable features. For\
\ the dispersion based methods\nin their default workflows, Seurat passes the\
\ cutoffs whereas Cell Ranger passes n_top_features.\n"
info: null
default:
- "seurat"
required: false
choices:
- "seurat"
- "cell_ranger"
- "seurat_v3"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--n_top_features"
description: "Number of highly-variable features to keep. Mandatory if flavor='seurat_v3'."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--min_mean"
description: "If n_top_features is defined, this and all other cutoffs for the\
\ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'."
info: null
default:
- 0.0125
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--max_mean"
description: "If n_top_features is defined, this and all other cutoffs for the\
\ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'."
info: null
default:
- 3.0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--min_disp"
description: "If n_top_features is defined, this and all other cutoffs for the\
\ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'."
info: null
default:
- 0.5
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--max_disp"
description: "If n_top_features is defined, this and all other cutoffs for the\
\ means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'.\
\ Default is +inf."
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "double"
name: "--span"
description: "The fraction of the data (cells) used when estimating the variance\
\ in the loess model fit if flavor='seurat_v3'."
info: null
default:
- 0.3
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--n_bins"
description: "Number of bins for binning the mean feature expression. Normalization\
\ is done with respect to each bin. If just a single feature falls into a bin,\
\ the normalized dispersion is artificially set to 1."
info: null
default:
- 20
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_batch_key"
description: "If specified, highly-variable features are selected within each\
\ batch separately and merged. This simple \nprocess avoids the selection of\
\ batch-specific features and acts as a lightweight batch correction method.\
\ \nFor all flavors, features are first sorted by how many batches they are\
\ a HVG. For dispersion-based flavors \nties are broken by normalized dispersion.\
\ If flavor = 'seurat_v3', ties are broken by the median (across\nbatches) rank\
\ based on within-batch normalized variance.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
By default no compression is applied.\n"
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "subset_vars.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Annotate highly variable features [Satija15] [Zheng17] [Stuart19].\n\
\nExpects logarithmized data, except when flavor='seurat_v3' in which count data\
\ is expected.\n\nDepending on flavor, this reproduces the R-implementations of\
\ Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19].\n\nFor the\
\ dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion\
\ is obtained by scaling with the mean and standard deviation of the dispersions\
\ for features falling into a given bin for mean expression of features. This means\
\ that for each bin of mean expression, highly variable features are selected.\n\
\nFor [Stuart19], a normalized variance for each feature is computed. First, the\
\ data are standardized (i.e., z-score normalization per feature) with a regularized\
\ standard deviation. Next, the normalized variance is computed as the variance\
\ of each feature after the transformation. Features are ranked by the normalized\
\ variance.\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "lowmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.12"
target_tag: "3.0.0"
namespace_separator: "/"
setup:
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scanpy~=1.10.4"
- "scikit-misc"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/feature_annotation/highly_variable_features_scanpy"
executable: "target/nextflow/feature_annotation/highly_variable_features_scanpy/main.nf"
viash_version: "0.9.4"
git_commit: "706b5ce24d313dcf947b7d9fe929630f1ad204e7"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.0-2-g706b5ce24d3"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calulations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
nextflow_labels_ci:
- path: "src/workflows/utils/labels_ci.config"
description: "Adds the correct memory and CPU labels when running on the Viash\
\ Hub CI."
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"3.0.0\""
- ".engines[.type == 'docker'].target_tag := '3.0.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a copy of the HDF5 file at ``input_path`` with compressed datasets.

    Every group of the input is recreated in the output together with its
    attributes. Datasets that are not yet compressed and are non-scalar are
    rewritten using ``compression``; all other datasets (already compressed,
    scalar, or without a shape) are copied unchanged. Finally the text header
    found at the very start of the input file is copied into the 512-byte user
    block of the output file and padded with NUL bytes.

    Args:
        input_path: Path of the file to read.
        output_path: Path of the file to create (opened with mode ``"w"``).
        compression: Compression filter passed to ``create_dataset``.

    Raises:
        NotImplementedError: If the input contains an object that is neither
            a ``Group`` nor a ``Dataset``.
    """
    # Size in bytes of the user block reserved at the start of the output file.
    userblock_size = 512
    input_path, output_path = str(input_path), str(output_path)

    def copy_attributes(in_object, out_object):
        # Mirror all HDF5 attributes from one object onto another.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        h5_object: Union[Group, Dataset],
    ):
        # Callback for ``visititems``; ``name`` and ``h5_object`` are supplied
        # positionally, the first two arguments are bound with ``partial``.
        if isinstance(h5_object, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(h5_object, new_group)
        elif isinstance(h5_object, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects dont have a shape defined
            if not h5_object.compression and h5_object.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=h5_object, compression=compression
                )
                copy_attributes(h5_object, new_dataset)
            else:
                output_h5.copy(h5_object, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(h5_object)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        starting_metadata = input_bytes.read(userblock_size)

    # The metadata is padded with extra null bytes up until 512 bytes.
    # ``find`` returns -1 when there is no NUL byte; slicing with -1 would
    # silently drop the last byte, so only truncate when a NUL was found.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]

    # Overwrite the (still empty) user block of the freshly written output.
    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression: Union[Literal["gzip"], Literal["lzf"], None] = None,
):
    """Copy an h5mu file, replace one modality in the copy, optionally compress it.

    The file at ``h5mu`` is copied, ``modality_data`` is written into the copy
    under ``modality_name`` with ``write_h5ad``, and the result ends up at
    ``output_file``. When ``output_compression`` is set, the copy is first
    made next to ``output_file`` as ``<stem>_uncompressed.h5mu``, compressed
    into ``output_file`` with ``compress_h5mu``, and then removed.

    Args:
        output_file: Destination path of the resulting file.
        h5mu: Path of the existing file used as the starting point.
        modality_name: Name of the modality to write.
        modality_data: Data to store for that modality.
        output_compression: Compression to apply, or ``None`` (or any falsy
            value) to write ``output_file`` directly without compression.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        # No compression requested: work on the final file directly.
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    # The copy is deliberately done outside the try block: if it fails (for
    # example because source and destination are the same file) nothing that
    # existed beforehand may be deleted by the cleanup below.
    shutil.copyfile(h5mu, output_file_uncompressed)
    try:
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Always remove the temporary copy, also when writing or compressing
        # raised, so that no stray "_uncompressed" file is left behind.
        output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the feature_annotation/highly_variable_features_scanpy component:
// name, entry script, required Nextflow version, component version, description and authors.
manifest {
name = 'feature_annotation/highly_variable_features_scanpy'
mainScript = 'main.nf'
nextflowVersion = '!>=20.12.1-edge'
version = '3.0.0'
description = 'Annotate highly variable features [Satija15] [Zheng17] [Stuart19].\n\nExpects logarithmized data, except when flavor=\'seurat_v3\' in which count data is expected.\n\nDepending on flavor, this reproduces the R-implementations of Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19].\n\nFor the dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for features falling into a given bin for mean expression of features. This means that for each bin of mean expression, highly variable features are selected.\n\nFor [Stuart19], a normalized variance for each feature is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each feature after the transformation. Features are ranked by the normalized variance.\n'
author = 'Dries De Maeyer, Robrecht Cannoodt'
}
// Default container image for processes.
process.container = 'nextflow/bash:latest'
// detect tempdir
// The first of the environment variables NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR
// that has a non-empty value is used; when none is set, '/tmp' is used.
// The result is converted to an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Configuration profiles.
profiles {
// no_publish: sets publishDir to disabled for every process name matching '.*'.
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// mount_temp: passes the detected tempDir as the `temp` setting of docker, podman and charliecloud.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Container-engine profiles: each one enables exactly one engine and
// explicitly disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// singularity additionally turns on singularity.autoMounts.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed-size resource labels: each label sets a constant memory or cpus value.
process{
// Decimal memory labels: values are powers-of-ten byte counts (1 GB = 10^9 B, 1 TB = 10^12 B).
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Binary memory labels: values are powers-of-two byte counts (1 GiB = 2^30 B, 1 TiB = 2^40 B).
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU labels: the number in the label name is the cpus value.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Pull in the additional label definitions from nextflow_labels.config
// (included after the fixed-size labels above).
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,48 @@
// Resource-label definitions: maps the CPU / memory / disk labels attached to
// processes onto concrete Nextflow process directives.
process {
// Default resources for components that hardly do any processing.
// The memory request is a closure: 2 GB multiplied by the attempt number.
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140, inclusive); any other exit status selects 'terminate'.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// The memory a task is assigned increases with each attempt.
// Uncomment the line below and adjust the value to set a global upper limit on the memory.
// resourceLimits = [ memory: 240.GB ]
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each label requests its base amount (4 / 25 / 50 / 75 GB) multiplied by the attempt number.
// If a memory resource limit and maxRetries are both set and the attempt number has
// reached maxRetries, the resource-limit memory is requested instead.
withLabel: lowmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 4.GB * task.attempt } }
withLabel: midmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 25.GB * task.attempt } }
withLabel: highmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 50.GB * task.attempt } }
withLabel: veryhighmem { memory = { task?.resourceLimits?.memory && task?.maxRetries && task.attempt >= task.maxRetries ? task.resourceLimits.memory : 75.GB * task.attempt } }
// Disk space
// Each closure below yields `process.disk` when it is set and null otherwise.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}

Some files were not shown because too many files have changed in this diff Show More