Build branch build/main with version build_main (280b21f)
Build pipeline: openpipelines-bio.openpipeline-spatial.build-main-6h5zv
Source commit: 280b21fc21
Source message: deploy: 6840f3802d04d96d44f29d3cdbd31c62d144b14d
This commit is contained in:
10
CHANGELOG.md
10
CHANGELOG.md
@@ -2,10 +2,16 @@
|
||||
|
||||
## NEW FUNCTIONALITY
|
||||
|
||||
* `filter/subset_cosmx`: Added a component to subset COSMX data (PR #3).
|
||||
* `filter/subset_cosmx`: Added a component to subset COSMX data (PR #3, PR #9).
|
||||
|
||||
* `convert/from_cosmx_to_h5mu`: Added converter component for COSMX data (PR #3).
|
||||
* `convert/from_cosmx_to_h5mu`: Added converter component for COSMX data (PR #3, PR #9).
|
||||
|
||||
* `mapping/spaceranger_count`: Added a spaceranger count component (PR #2).
|
||||
|
||||
* `convert/from_spatialdata_to_h5mu`, `convert/from_xenium_to_spatialdata`: Added converter components for xenium data (PR #1).
|
||||
|
||||
* `convert/from_xenium_to_spatialexperiment`, `convert/from_cosmx_to_spatialexperiment`: Added converter components for Xenium or CosMx data to SpatialExperiment objects (PR #9).
|
||||
|
||||
* `workflows/qc/qc`: Added a pipeline for calculating qc metrics of spatial omics samples (PR #5).
|
||||
|
||||
* `workflows/multiomics/spatial_process_samples`: Added a pipeline to pre-process multiple spatial omics samples (PR #7).
|
||||
|
||||
10
_viash.yaml
10
_viash.yaml
@@ -1,4 +1,4 @@
|
||||
viash_version: 0.9.3
|
||||
viash_version: 0.9.4
|
||||
source: src
|
||||
target: target
|
||||
name: openpipeline_spatial
|
||||
@@ -12,15 +12,15 @@ repositories:
|
||||
type: github
|
||||
tag: 2.1.2
|
||||
- name: openpipeline_incubator
|
||||
repo: openpipelines-bio/openpipeline_incubator
|
||||
type: github
|
||||
tag: main
|
||||
repo: openpipeline_incubator
|
||||
type: vsh
|
||||
tag: build/main
|
||||
info:
|
||||
test_resources:
|
||||
- type: s3
|
||||
path: s3://openpipelines-bio/openpipeline_spatial/resources_test
|
||||
dest: resources_test
|
||||
config_mods: |-
|
||||
.resources += {path: '/src/labels.config', dest: 'nextflow_labels.config'}
|
||||
.resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'
|
||||
version: build_main
|
||||
|
||||
@@ -32,14 +32,14 @@ fi
|
||||
viash run src/filter/subset_cosmx/config.vsh.yaml -- \
|
||||
--input "$OUT" \
|
||||
--num_fovs 3 \
|
||||
--dataset_id "$ID" \
|
||||
--subset_transcripts_file True \
|
||||
--subset_polygons_file False \
|
||||
--output "${DIR}/${ID}_tiny"
|
||||
|
||||
viash run src/convert/from_cosmx_to_h5mu/config.vsh.yaml -- \
|
||||
--input ${DIR}/${ID}_tiny \
|
||||
--dataset_id "$ID" \
|
||||
--output "$DIR/${ID}_tiny.h5mu" \
|
||||
--compression "gzip"
|
||||
--output_compression "gzip"
|
||||
|
||||
rm -rf "$OUT"
|
||||
|
||||
|
||||
12
src/authors/dries_schaumont.yaml
Normal file
12
src/authors/dries_schaumont.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
name: Dries Schaumont
|
||||
info:
|
||||
role: Core Team Member
|
||||
links:
|
||||
email: dries@data-intuitive.com
|
||||
github: DriesSchaumont
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: dries-schaumont
|
||||
organizations:
|
||||
- name: Data Intuitive
|
||||
href: https://www.data-intuitive.com
|
||||
role: Data Scientist
|
||||
@@ -21,19 +21,6 @@ arguments:
|
||||
example: cosmx_data
|
||||
direction: input
|
||||
required: true
|
||||
- name: "--dataset_id"
|
||||
type: string
|
||||
description: |
|
||||
ID of the dataset. By default expects the following file structure:
|
||||
path/to/dataset/
|
||||
├── CellComposite/
|
||||
├── CellLabels/
|
||||
├── CellOverlay/
|
||||
├── CompartmentLabels/
|
||||
├── <dataset_id>_exprMat_file.csv
|
||||
├── <dataset_id>_fov_positions_file.csv
|
||||
├── <dataset_id>_metadata_file.csv
|
||||
└── <dataset_id>_tx_file.csv
|
||||
- name: "--modality"
|
||||
type: string
|
||||
default: rna
|
||||
|
||||
@@ -2,12 +2,12 @@ import sys
|
||||
import os
|
||||
import squidpy as sq
|
||||
import mudata as mu
|
||||
import glob
|
||||
|
||||
## VIASH START
|
||||
par = {
|
||||
"input": "./resources_test/cosmx/Lung5_Rep2_tiny",
|
||||
"output": "./resources_test/cosmx/Lung5_Rep2_tiny.h5mu",
|
||||
"dataset_id": "Lung5_Rep2",
|
||||
"modality": "rna",
|
||||
"output_compression": None,
|
||||
}
|
||||
@@ -19,14 +19,19 @@ from setup_logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
counts_file = f"{par['dataset_id']}_exprMat_file.csv"
|
||||
fov_file = f"{par['dataset_id']}_fov_positions_file.csv"
|
||||
meta_file = f"{par['dataset_id']}_metadata_file.csv"
|
||||
|
||||
for file in [counts_file, fov_file, meta_file]:
|
||||
assert os.path.isfile(os.path.join(par["input"], file)), (
|
||||
f"File does not exist: {file}"
|
||||
def find_matrix_file(suffix):
|
||||
pattern = os.path.join(par["input"], f"*{suffix}")
|
||||
files = glob.glob(pattern)
|
||||
assert len(files) == 1, (
|
||||
f"Only one file matching pattern {pattern} should be present"
|
||||
)
|
||||
return files[0]
|
||||
|
||||
|
||||
counts_file = find_matrix_file("exprMat_file.csv")
|
||||
fov_file = find_matrix_file("fov_positions_file.csv")
|
||||
meta_file = find_matrix_file("metadata_file.csv")
|
||||
|
||||
logger.info("Reading in CosMx data...")
|
||||
adata = sq.read.nanostring(
|
||||
|
||||
82
src/convert/from_cosmx_to_spatialexperiment/config.vsh.yaml
Normal file
82
src/convert/from_cosmx_to_spatialexperiment/config.vsh.yaml
Normal file
@@ -0,0 +1,82 @@
|
||||
name: "from_cosmx_to_spatialexperiment"
|
||||
namespace: "convert"
|
||||
scope: "public"
|
||||
description: |
|
||||
Creates a SpatialExperiment object from the downloaded unzipped CosMx directory for Nanostring
|
||||
CosMx spatial gene expression data, and saves it as a SpatialExperiment object.
|
||||
The constructor assumes the downloaded unzipped CosMx Folder has the following structure:
|
||||
|
||||
Mandatory files
|
||||
· | — *_exprMat_file.csv
|
||||
· | — *_metadata_file.csv
|
||||
Optional files, by default added to the metadata() as a list of paths (will be converted to parquet):
|
||||
· | — *_fov_positions_file.csv
|
||||
· | — *_tx_file.csv
|
||||
· | — *_polygons.csv
|
||||
|
||||
authors:
|
||||
- __merge__: /src/authors/dorien_roosen.yaml
|
||||
roles: [ author, maintainer ]
|
||||
arguments:
|
||||
- name: "--input"
|
||||
alternatives: ["-i"]
|
||||
type: file
|
||||
description: Input CosMx directory
|
||||
direction: input
|
||||
required: true
|
||||
example: path/to/cosmx_bundle
|
||||
- name: "--add_tx_path"
|
||||
type: boolean
|
||||
default: true
|
||||
description: |
|
||||
Whether to add parquet paths to the metadata.
|
||||
If True, `*_tx_file.csv` file will be converted to .parquet and added to the metadata.
|
||||
- name: "--add_polygon_path"
|
||||
type: boolean
|
||||
default: true
|
||||
description: |
|
||||
Whether to add polygon path to the metadata.
|
||||
If True, `*_polygons.csv` file will be converted to .parquet and be added to the metadata.
|
||||
- name: "--add_fov_positions"
|
||||
type: boolean
|
||||
default: true
|
||||
description: |
|
||||
Whether to add fov positions to the metadata.
|
||||
If True, `fov_positions_file.csv` will be added to the metadata.
|
||||
- name: "--alternative_experiment_features"
|
||||
type: string
|
||||
multiple: true
|
||||
description: Feature names containing these strings will be moved to altExps(sxe) slots as separate SpatialExperiment objects.
|
||||
default: [NegPrb, Negative, SystemControl, FalseCode]
|
||||
- name: "--output"
|
||||
alternatives: ["-o"]
|
||||
type: file
|
||||
description: Output SpatialExperiment file
|
||||
direction: output
|
||||
required: true
|
||||
example: output.rds
|
||||
resources:
|
||||
- type: r_script
|
||||
path: script.R
|
||||
test_resources:
|
||||
- type: r_script
|
||||
path: test.R
|
||||
- path: /resources_test/cosmx/Lung5_Rep2_tiny
|
||||
engines:
|
||||
- type: docker
|
||||
image: rocker/r2u:24.04
|
||||
setup:
|
||||
- type: apt
|
||||
packages:
|
||||
- libhdf5-dev
|
||||
- libgeos-dev
|
||||
- type: r
|
||||
bioc: [ SpatialExperimentIO ]
|
||||
test_setup:
|
||||
- type: r
|
||||
cran: [ testthat ]
|
||||
runners:
|
||||
- type: executable
|
||||
- type: nextflow
|
||||
directives:
|
||||
label: [lowmem, singlecpu]
|
||||
34
src/convert/from_cosmx_to_spatialexperiment/script.R
Normal file
34
src/convert/from_cosmx_to_spatialexperiment/script.R
Normal file
@@ -0,0 +1,34 @@
|
||||
library(SpatialExperimentIO)
|
||||
|
||||
### VIASH START
|
||||
par <- list(
|
||||
input = "resources_test/cosmx/Lung5_Rep2_tiny",
|
||||
add_tx_path = TRUE,
|
||||
add_polygon_path = FALSE,
|
||||
add_fov_positions = TRUE,
|
||||
alternative_experiment_features = c("NegPrb", "Negative", "SystemControl", "FalseCode"),
|
||||
output = "spe_cosmx_test.rds"
|
||||
)
|
||||
### VIASH END
|
||||
|
||||
if (par$add_polygon_path == FALSE & par$add_tx_path == FALSE) {
|
||||
add_parquet_paths <- FALSE
|
||||
} else {
|
||||
add_parquet_paths <- TRUE
|
||||
}
|
||||
|
||||
spe <- readCosmxSXE(
|
||||
dirName = par$input,
|
||||
returnType = "SPE",
|
||||
countMatPattern = "exprMat_file.csv",
|
||||
metaDataPattern = "metadata_file.csv",
|
||||
coordNames = c("CenterX_global_px", "CenterY_global_px"),
|
||||
addFovPos = par$add_fov_positions,
|
||||
fovPosPattern = "fov_positions_file.csv",
|
||||
addParquetPaths = add_parquet_paths,
|
||||
addPolygon = par$add_polygon_path,
|
||||
addTx = par$add_tx_path,
|
||||
altExps = par$alternative_experiment_features
|
||||
)
|
||||
|
||||
saveRDS(spe, file = par$output)
|
||||
107
src/convert/from_cosmx_to_spatialexperiment/test.R
Normal file
107
src/convert/from_cosmx_to_spatialexperiment/test.R
Normal file
@@ -0,0 +1,107 @@
|
||||
library(testthat, warn.conflicts = FALSE)
|
||||
library(SpatialExperimentIO)
|
||||
library(SpatialExperiment)
|
||||
|
||||
## VIASH START
|
||||
meta <- list(
|
||||
executable = "target/executable/convert/from_cosmx_to_spatialexperiment/from_cosmx_to_spatialexperiment",
|
||||
resources_dir = "resources_test/cosmx/",
|
||||
name = "from_cosmx_to_spatialexperiment"
|
||||
)
|
||||
## VIASH END
|
||||
|
||||
cat("> Checking simple execution\n")
|
||||
|
||||
spe <- paste0(
|
||||
meta[["resources_dir"]],
|
||||
"/Lung5_Rep2_tiny"
|
||||
)
|
||||
out_rds <- "output.rds"
|
||||
|
||||
cat("> Running ", meta[["name"]], "\n", sep = "")
|
||||
out <- processx::run(
|
||||
meta[["executable"]],
|
||||
c(
|
||||
"--input", spe,
|
||||
"--add_tx_path", TRUE,
|
||||
"--add_polygon_path", FALSE,
|
||||
"--output", out_rds
|
||||
)
|
||||
)
|
||||
|
||||
cat("> Checking whether output file exists\n")
|
||||
expect_equal(out$status, 0)
|
||||
expect_true(file.exists(out_rds))
|
||||
|
||||
cat("> Reading output file\n")
|
||||
obj <- readRDS(file = out_rds)
|
||||
|
||||
cat("> Checking whether Seurat object is in the right format\n")
|
||||
# Object type
|
||||
expect_is(obj, "SpatialExperiment")
|
||||
# Assay structure
|
||||
expect_equal(names(slot(obj, "assays")), "counts")
|
||||
# Spatial coordinates
|
||||
expect_equal(spatialCoordsNames(obj), c("CenterX_global_px", "CenterY_global_px"))
|
||||
# Alternative experiments
|
||||
expect_equal(altExpNames(obj), c("NegPrb"))
|
||||
# Metadata components
|
||||
expect_named(
|
||||
metadata(obj),
|
||||
c("fov_positions", "transcripts"),
|
||||
ignore.order = TRUE
|
||||
)
|
||||
# Parquet paths
|
||||
expect_true(grepl("\\.parquet$", metadata(obj)[["transcripts"]]))
|
||||
# Dimensions
|
||||
input <- readCosmxSXE(
|
||||
dirName = spe,
|
||||
addParquetPaths = FALSE,
|
||||
returnType = "SPE"
|
||||
)
|
||||
|
||||
dim_rds <- dim(obj)
|
||||
dim_input <- dim(input)
|
||||
|
||||
expect_equal(dim_rds, dim_input)
|
||||
|
||||
|
||||
cat("> Checking parameter functionality\n")
|
||||
|
||||
out_rds_ext <- "output_ext.rds"
|
||||
|
||||
cat("> Running ", meta[["name"]], "\n", sep = "")
|
||||
out_ext <- processx::run(
|
||||
meta[["executable"]],
|
||||
c(
|
||||
"--input", spe,
|
||||
"--add_fov_positions", FALSE,
|
||||
"--add_tx_path", FALSE,
|
||||
"--add_polygon_path", FALSE,
|
||||
"--alternative_experiment_features", c("Negative"),
|
||||
"--output", out_rds_ext
|
||||
)
|
||||
)
|
||||
|
||||
cat("> Checking whether output file exists\n")
|
||||
expect_equal(out_ext$status, 0)
|
||||
expect_true(file.exists(out_rds_ext))
|
||||
|
||||
cat("> Reading output file\n")
|
||||
obj_ext <- readRDS(file = out_rds_ext)
|
||||
|
||||
cat("> Checking whether Seurat object is in the right format\n")
|
||||
# Object type
|
||||
expect_is(obj_ext, "SpatialExperiment")
|
||||
# Assay structure
|
||||
expect_equal(names(slot(obj_ext, "assays")), "counts")
|
||||
# Spatial coordinates
|
||||
expect_equal(spatialCoordsNames(obj_ext), c("CenterX_global_px", "CenterY_global_px"))
|
||||
# Alternative experiments
|
||||
expect_length(altExpNames(obj_ext), 0)
|
||||
# Metadata components
|
||||
expect_length(metadata(obj_ext), 0)
|
||||
|
||||
dim_rds_ext <- dim(obj_ext)
|
||||
expect_true(identical(dim_rds_ext[2], dim_input[2]))
|
||||
expect_false(identical(dim_rds_ext[1], dim_input[1]))
|
||||
75
src/convert/from_xenium_to_spatialexperiment/config.vsh.yaml
Normal file
75
src/convert/from_xenium_to_spatialexperiment/config.vsh.yaml
Normal file
@@ -0,0 +1,75 @@
|
||||
name: "from_xenium_to_spatialexperiment"
|
||||
namespace: "convert"
|
||||
scope: "public"
|
||||
description: |
|
||||
Creates a SpatialExperiment object from the downloaded unzipped Xenium Output Bundle directory
|
||||
for 10x Genomics Xenium spatial gene expression data, and saves it as a SpatialExperiment object.
|
||||
The constructor assumes the downloaded unzipped Xenium Output Bundle has the following structure:
|
||||
|
||||
Mandatory files
|
||||
· | — cell_feature_matrix.h5
|
||||
· | — cells.parquet
|
||||
Optional files, by default added to the metadata() as a list of paths (will be converted to parquet):
|
||||
· | — transcripts.parquet
|
||||
· | — cell_boundaries.parquet
|
||||
· | — nucleus_boundaries.parquet
|
||||
· | — experiment.xenium
|
||||
|
||||
authors:
|
||||
- __merge__: /src/authors/dorien_roosen.yaml
|
||||
roles: [ author, maintainer ]
|
||||
arguments:
|
||||
- name: "--input"
|
||||
alternatives: ["-i"]
|
||||
type: file
|
||||
description: Input Xenium Output Bundle
|
||||
direction: input
|
||||
required: true
|
||||
example: path/to/xenium_bundle
|
||||
- name: "--add_experiment_xenium"
|
||||
type: boolean
|
||||
default: true
|
||||
description: Whether to add the parameters from the `experiment.xenium` file to the metadata.
|
||||
- name: "--add_parquet_paths"
|
||||
type: boolean
|
||||
default: true
|
||||
description: |
|
||||
Whether to add parquet paths to the metadata.
|
||||
If True, `transcripts.parquet`, `cell_boundaries.parquet`, `nucleus_boundaries.parquet` will be added to the metadata.
|
||||
- name: "--alternative_experiment_features"
|
||||
type: string
|
||||
multiple: true
|
||||
description: Feature names containing these strings will be moved to altExps(sxe) slots as separate SpatialExperiment objects.
|
||||
default: [NegControlProbe, UnassignedCodeword, NegControlCodeword, antisense, BLANK]
|
||||
- name: "--output"
|
||||
alternatives: ["-o"]
|
||||
type: file
|
||||
description: Output SpatialExperiment file
|
||||
direction: output
|
||||
required: true
|
||||
example: output.rds
|
||||
resources:
|
||||
- type: r_script
|
||||
path: script.R
|
||||
test_resources:
|
||||
- type: r_script
|
||||
path: test.R
|
||||
- path: /resources_test/xenium/xenium_tiny
|
||||
engines:
|
||||
- type: docker
|
||||
image: rocker/r2u:24.04
|
||||
setup:
|
||||
- type: apt
|
||||
packages:
|
||||
- libhdf5-dev
|
||||
- libgeos-dev
|
||||
- type: r
|
||||
bioc: [ SpatialExperimentIO ]
|
||||
test_setup:
|
||||
- type: r
|
||||
cran: [ testthat ]
|
||||
runners:
|
||||
- type: executable
|
||||
- type: nextflow
|
||||
directives:
|
||||
label: [lowmem, singlecpu]
|
||||
25
src/convert/from_xenium_to_spatialexperiment/script.R
Normal file
25
src/convert/from_xenium_to_spatialexperiment/script.R
Normal file
@@ -0,0 +1,25 @@
|
||||
library(SpatialExperimentIO)
|
||||
|
||||
### VIASH START
|
||||
par <- list(
|
||||
input = "resources_test/xenium/xenium_tiny",
|
||||
add_experiment_xenium = TRUE,
|
||||
add_parquet_paths = TRUE,
|
||||
alternative_experiment_features = c("NegControlProbe", "UnassignedCodeword", "NegControlCodeword", "antisense", "BLANK"),
|
||||
output = "spe_test.rds"
|
||||
)
|
||||
### VIASH END
|
||||
|
||||
|
||||
spe <- readXeniumSXE(
|
||||
dirName = par$input,
|
||||
returnType = "SPE",
|
||||
countMatPattern = "cell_feature_matrix.h5",
|
||||
metaDataPattern = "cells.parquet",
|
||||
coordNames = c("x_centroid", "y_centroid"),
|
||||
addExperimentXenium = par$add_experiment_xenium,
|
||||
addParquetPaths = par$add_parquet_paths,
|
||||
altExps = par$alternative_experiment_features
|
||||
)
|
||||
|
||||
saveRDS(spe, file = par$output)
|
||||
106
src/convert/from_xenium_to_spatialexperiment/test.R
Normal file
106
src/convert/from_xenium_to_spatialexperiment/test.R
Normal file
@@ -0,0 +1,106 @@
|
||||
library(testthat, warn.conflicts = FALSE)
|
||||
library(SpatialExperimentIO)
|
||||
library(SpatialExperiment)
|
||||
|
||||
## VIASH START
|
||||
meta <- list(
|
||||
executable = "target/executable/convert/from_xenium_to_spatialexperiment/from_xenium_to_spatialexperiment",
|
||||
resources_dir = "resources_test/xenium",
|
||||
name = "from_xenium_to_spatial_experiment"
|
||||
)
|
||||
## VIASH END
|
||||
|
||||
cat("> Checking simple execution\n")
|
||||
|
||||
spe <- paste0(
|
||||
meta[["resources_dir"]],
|
||||
"/xenium_tiny"
|
||||
)
|
||||
out_rds <- "output.rds"
|
||||
|
||||
cat("> Running ", meta[["name"]], "\n", sep = "")
|
||||
out <- processx::run(
|
||||
meta[["executable"]],
|
||||
c(
|
||||
"--input", spe,
|
||||
"--output", out_rds
|
||||
)
|
||||
)
|
||||
|
||||
cat("> Checking whether output file exists\n")
|
||||
expect_equal(out$status, 0)
|
||||
expect_true(file.exists(out_rds))
|
||||
|
||||
cat("> Reading output file\n")
|
||||
obj <- readRDS(file = out_rds)
|
||||
|
||||
cat("> Checking whether Seurat object is in the right format\n")
|
||||
# Object type
|
||||
expect_is(obj, "SpatialExperiment")
|
||||
# Assay structure
|
||||
expect_equal(names(slot(obj, "assays")), "counts")
|
||||
# Spatial coordinates
|
||||
expect_equal(spatialCoordsNames(obj), c("x_centroid", "y_centroid"))
|
||||
# Alternative experiments
|
||||
expect_equal(altExpNames(obj), c("NegControlProbe", "UnassignedCodeword", "NegControlCodeword"))
|
||||
# Metadata components
|
||||
metadata_components <- c("experiment.xenium", "transcripts", "cell_boundaries", "nucleus_boundaries")
|
||||
expect_named(
|
||||
metadata(obj),
|
||||
metadata_components,
|
||||
ignore.order = TRUE
|
||||
)
|
||||
# Parquet paths
|
||||
parquet_components <- c("transcripts", "cell_boundaries", "nucleus_boundaries")
|
||||
for (component in parquet_components) {
|
||||
expect_true(grepl("\\.parquet$", metadata(obj)[[component]]))
|
||||
}
|
||||
# Dimensions
|
||||
input <- readXeniumSXE(
|
||||
dirName = spe,
|
||||
returnType = "SPE"
|
||||
)
|
||||
dim_rds <- dim(obj)
|
||||
dim_input <- dim(input)
|
||||
|
||||
expect_equal(dim_rds, dim_input)
|
||||
|
||||
|
||||
cat("> Checking parameter functionality\n")
|
||||
|
||||
out_rds_ext <- "output_ext.rds"
|
||||
|
||||
cat("> Running ", meta[["name"]], "\n", sep = "")
|
||||
out_ext <- processx::run(
|
||||
meta[["executable"]],
|
||||
c(
|
||||
"--input", spe,
|
||||
"--add_experiment_xenium", FALSE,
|
||||
"--add_parquet_paths", FALSE,
|
||||
"--alternative_experiment_features", c("NegControlProbe"),
|
||||
"--output", out_rds_ext
|
||||
)
|
||||
)
|
||||
|
||||
cat("> Checking whether output file exists\n")
|
||||
expect_equal(out_ext$status, 0)
|
||||
expect_true(file.exists(out_rds_ext))
|
||||
|
||||
cat("> Reading output file\n")
|
||||
obj_ext <- readRDS(file = out_rds_ext)
|
||||
|
||||
cat("> Checking whether Seurat object is in the right format\n")
|
||||
# Object type
|
||||
expect_is(obj_ext, "SpatialExperiment")
|
||||
# Assay structure
|
||||
expect_equal(names(slot(obj_ext, "assays")), "counts")
|
||||
# Spatial coordinates
|
||||
expect_equal(spatialCoordsNames(obj_ext), c("x_centroid", "y_centroid"))
|
||||
# Alternative experiments
|
||||
expect_equal(altExpNames(obj_ext), c("NegControlProbe"))
|
||||
# Metadata components
|
||||
expect_true(length(metadata(obj_ext)) == 0)
|
||||
|
||||
dim_rds_ext <- dim(obj_ext)
|
||||
expect_true(identical(dim_rds_ext[2], dim_input[2]))
|
||||
expect_false(identical(dim_rds_ext[1], dim_input[1]))
|
||||
@@ -26,23 +26,18 @@ arguments:
|
||||
example: cosmx_data
|
||||
direction: input
|
||||
required: true
|
||||
- name: "--dataset_id"
|
||||
type: string
|
||||
description: |
|
||||
ID of the dataset. By default expects the following file structure:
|
||||
path/to/dataset/
|
||||
├── CellComposite/
|
||||
├── CellLabels/
|
||||
├── CellOverlay/
|
||||
├── CompartmentLabels/
|
||||
├── <dataset_id>_exprMat_file.csv
|
||||
├── <dataset_id>_fov_positions_file.csv
|
||||
├── <dataset_id>_metadata_file.csv
|
||||
└── <dataset_id>_tx_file.csv
|
||||
- name: "--num_fovs"
|
||||
type: integer
|
||||
required: true
|
||||
description: Number of fields of view to keep. Will keep only the first <num_fovs> fields of view.
|
||||
- name: "--subset_transcripts_file"
|
||||
type: boolean
|
||||
default: true
|
||||
description: Whether to subset the `*_tx_file.csv` file.
|
||||
- name: "--subset_polygons_file"
|
||||
type: boolean
|
||||
default: true
|
||||
description: Whether to subset the `*_polygons.csv` file.
|
||||
- name: "--output"
|
||||
alternatives: ["-o"]
|
||||
type: file
|
||||
|
||||
@@ -9,7 +9,8 @@ import sys
|
||||
par = {
|
||||
"input": "./resources_test/cosmx/Lung5_Rep2",
|
||||
"output": "./resources_test/cosmx/Lung5_Rep2_tiny/",
|
||||
"dataset_id": "Lung5_Rep2",
|
||||
"subset_transcripts_file": True,
|
||||
"subset_polygons_file": False,
|
||||
"num_fovs": 5,
|
||||
}
|
||||
meta = {"resources_dir": "src/utils"}
|
||||
@@ -21,15 +22,15 @@ from setup_logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
counts_file = f"{par['dataset_id']}_exprMat_file.csv"
|
||||
fov_file = f"{par['dataset_id']}_fov_positions_file.csv"
|
||||
meta_file = f"{par['dataset_id']}_metadata_file.csv"
|
||||
tx_file = f"{par['dataset_id']}_tx_file.csv"
|
||||
|
||||
for file in [counts_file, fov_file, meta_file]:
|
||||
assert os.path.isfile(os.path.join(par["input"], file)), (
|
||||
f"File does not exist: {file}"
|
||||
def find_matrix_file(suffix):
|
||||
pattern = os.path.join(par["input"], f"*{suffix}")
|
||||
files = glob.glob(pattern)
|
||||
assert len(files) == 1, (
|
||||
f"Only one file matching pattern {pattern} should be present"
|
||||
)
|
||||
return files[0]
|
||||
|
||||
|
||||
kept_fovs = list(range(1, par["num_fovs"] + 1))
|
||||
|
||||
@@ -49,9 +50,20 @@ for image_dir in image_dirs:
|
||||
shutil.copy2(file_path[0], os.path.join(par["output"], image_dir))
|
||||
|
||||
# Matrices
|
||||
matrices = [counts_file, fov_file, meta_file, tx_file]
|
||||
counts_file = find_matrix_file("exprMat_file.csv")
|
||||
fov_file = find_matrix_file("fov_positions_file.csv")
|
||||
meta_file = find_matrix_file("metadata_file.csv")
|
||||
|
||||
matrices = [counts_file, fov_file, meta_file]
|
||||
if par["subset_transcripts_file"]:
|
||||
tx_file = find_matrix_file("tx_file.csv")
|
||||
matrices.append(tx_file)
|
||||
if par["subset_polygons_file"]:
|
||||
polygons_file = find_matrix_file("polygons.csv")
|
||||
matrices.append(polygons_file)
|
||||
|
||||
for matrix in matrices:
|
||||
logger.info(f"Subsetting {matrix}, keeping fovs {kept_fovs}")
|
||||
data = pd.read_csv(os.path.join(par["input"], matrix))
|
||||
data = pd.read_csv(matrix)
|
||||
data_tiny = data[data["fov"].isin(kept_fovs)]
|
||||
data_tiny.to_csv(os.path.join(par["output"], matrix), index=False)
|
||||
data_tiny.to_csv(os.path.join(par["output"], os.path.basename(matrix)), index=False)
|
||||
|
||||
@@ -11,8 +11,10 @@ def test_simple_execution(run_component, tmp_path):
|
||||
[
|
||||
"--input",
|
||||
meta["resources_dir"] + "/Lung5_Rep2_tiny",
|
||||
"--dataset_id",
|
||||
dataset_id,
|
||||
"--subset_transcripts_file",
|
||||
"True",
|
||||
"--subset_polygons_file",
|
||||
"False",
|
||||
"--num_fovs",
|
||||
"2",
|
||||
"--output",
|
||||
|
||||
318
src/workflows/multiomics/spatial_process_samples/config.vsh.yaml
Normal file
318
src/workflows/multiomics/spatial_process_samples/config.vsh.yaml
Normal file
@@ -0,0 +1,318 @@
|
||||
name: "spatial_process_samples"
|
||||
namespace: "workflows/multiomics"
|
||||
scope: "public"
|
||||
description: "A pipeline to pre-process multiple spatial omics samples."
|
||||
authors:
|
||||
- __merge__: /src/authors/dries_schaumont.yaml
|
||||
roles: [ author, maintainer ]
|
||||
- __merge__: /src/authors/dorien_roosen.yaml
|
||||
roles: [ contributor ]
|
||||
- __merge__: /src/authors/weiwei_schultz.yaml
|
||||
roles: [ contributor ]
|
||||
|
||||
argument_groups:
|
||||
- name: Inputs
|
||||
arguments:
|
||||
- name: "--id"
|
||||
required: true
|
||||
type: string
|
||||
description: ID of the sample.
|
||||
example: foo
|
||||
- name: "--input"
|
||||
alternatives: [-i]
|
||||
description: Path to the sample.
|
||||
required: true
|
||||
example: input.h5mu
|
||||
type: file
|
||||
- name: "--rna_layer"
|
||||
type: string
|
||||
description: "Input layer for the gene expression modality. If not specified, .X is used."
|
||||
required: false
|
||||
- name: "--prot_layer"
|
||||
type: string
|
||||
description: "Input layer for the antibody capture modality. If not specified, .X is used."
|
||||
required: false
|
||||
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- name: "--output"
|
||||
type: file
|
||||
required: true
|
||||
direction: output
|
||||
description: Destination path to the output.
|
||||
example: output.h5mu
|
||||
|
||||
- name: "Sample ID options"
|
||||
description: |
|
||||
Options for adding the id to .obs on the MuData object. Having a sample
|
||||
id present is a requirement of several components for this pipeline.
|
||||
arguments:
|
||||
- name: "--add_id_to_obs"
|
||||
description: "Add the value passed with --id to .obs."
|
||||
type: boolean
|
||||
default: true
|
||||
- name: --add_id_obs_output
|
||||
description: |
|
||||
.Obs column to add the sample IDs to. Required and only used when
|
||||
--add_id_to_obs is set to 'true'
|
||||
type: string
|
||||
default: "sample_id"
|
||||
- name: "--add_id_make_observation_keys_unique"
|
||||
type: boolean
|
||||
description: |
|
||||
Join the id to the .obs index (.obs_names).
|
||||
Only used when --add_id_to_obs is set to 'true'.
|
||||
default: true
|
||||
|
||||
- name: "RNA filtering options"
|
||||
arguments:
|
||||
- name: "--rna_min_counts"
|
||||
example: 200
|
||||
min: 1
|
||||
type: integer
|
||||
description: Minimum number of counts captured per cell.
|
||||
- name: "--rna_max_counts"
|
||||
example: 5000000
|
||||
min: 1
|
||||
type: integer
|
||||
description: Maximum number of counts captured per cell.
|
||||
- name: "--rna_min_genes_per_cell"
|
||||
type: integer
|
||||
min: 1
|
||||
example: 200
|
||||
description: Minimum of non-zero values per cell.
|
||||
- name: "--rna_max_genes_per_cell"
|
||||
example: 1500000
|
||||
min: 1
|
||||
type: integer
|
||||
description: Maximum of non-zero values per cell.
|
||||
- name: "--rna_min_cells_per_gene"
|
||||
example: 3
|
||||
min: 1
|
||||
type: integer
|
||||
description: Minimum of non-zero values per gene.
|
||||
- name: "--rna_min_fraction_mito"
|
||||
example: 0
|
||||
min: 0
|
||||
max: 1
|
||||
type: double
|
||||
description: Minimum fraction of UMIs that are mitochondrial.
|
||||
- name: "--rna_max_fraction_mito"
|
||||
type: double
|
||||
min: 0
|
||||
max: 1
|
||||
example: 0.2
|
||||
description: Maximum fraction of UMIs that are mitochondrial.
|
||||
- name: "--rna_min_fraction_ribo"
|
||||
example: 0
|
||||
min: 0
|
||||
max: 1
|
||||
type: double
|
||||
description: Minimum fraction of UMIs that are ribosomal.
|
||||
- name: "--rna_max_fraction_ribo"
|
||||
type: double
|
||||
min: 0
|
||||
max: 1
|
||||
example: 0.2
|
||||
description: Maximum fraction of UMIs that are ribosomal.
|
||||
|
||||
- name: "Protein filtering options"
|
||||
arguments:
|
||||
- name: "--prot_min_counts"
|
||||
description: Minimum number of counts per cell.
|
||||
type: integer
|
||||
min: 1
|
||||
example: 3
|
||||
- name: "--prot_max_counts"
|
||||
description: Maximum number of counts per cell.
|
||||
type: integer
|
||||
min: 1
|
||||
example: 5000000
|
||||
- name: "--prot_min_proteins_per_cell"
|
||||
type: integer
|
||||
min: 1
|
||||
example: 200
|
||||
description: Minimum of non-zero values per cell.
|
||||
- name: "--prot_max_proteins_per_cell"
|
||||
description: Maximum of non-zero values per cell.
|
||||
type: integer
|
||||
min: 1
|
||||
example: 100000000
|
||||
- name: "--prot_min_cells_per_protein"
|
||||
example: 3
|
||||
min: 1
|
||||
type: integer
|
||||
description: Minimum of non-zero values per protein.
|
||||
|
||||
- name: "Highly variable features detection"
|
||||
arguments:
|
||||
- name: "--highly_variable_features_var_output"
|
||||
alternatives: ["--filter_with_hvg_var_output"]
|
||||
required: false
|
||||
type: string
|
||||
default: "filter_with_hvg"
|
||||
description: In which .var slot to store a boolean array corresponding to the highly variable genes.
|
||||
- name: "--highly_variable_features_obs_batch_key"
|
||||
alternatives: ["--filter_with_hvg_obs_batch_key"]
|
||||
type: string
|
||||
default: "sample_id"
|
||||
required: false
|
||||
description: |
|
||||
If specified, highly-variable genes are selected within each batch separately and merged. This simple
|
||||
process avoids the selection of batch-specific genes and acts as a lightweight batch correction method.
|
||||
- name: "Mitochondrial & Ribosomal Gene Detection"
|
||||
arguments:
|
||||
- name: "--var_gene_names"
|
||||
required: false
|
||||
example: "gene_symbol"
|
||||
type: string
|
||||
description: |
|
||||
.var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).
|
||||
Gene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be
|
||||
identified as mitochondrial or ribosomal genes, respectively.
|
||||
- name: "--var_name_mitochondrial_genes"
|
||||
type: string
|
||||
required: false
|
||||
description: |
|
||||
In which .var slot to store a boolean array corresponding to the mitochondrial genes.
|
||||
- name: "--obs_name_mitochondrial_fraction"
|
||||
type: string
|
||||
required: false
|
||||
description: |
|
||||
When specified, write the fraction of counts originating from mitochondrial genes
|
||||
(based on --mitochondrial_gene_regex) to an .obs column with the specified name.
|
||||
Requires --var_name_mitochondrial_genes.
|
||||
- name: --mitochondrial_gene_regex
|
||||
type: string
|
||||
description: |
|
||||
Regex string that identifies mitochondrial genes from --var_gene_names.
|
||||
By default will detect human and mouse mitochondrial genes from a gene symbol.
|
||||
required: false
|
||||
default: "^[mM][tT]-"
|
||||
- name: "--var_name_ribosomal_genes"
|
||||
type: string
|
||||
required: false
|
||||
description: |
|
||||
In which .var slot to store a boolean array corresponding to the ribosomal genes.
|
||||
- name: "--obs_name_ribosomal_fraction"
|
||||
type: string
|
||||
required: false
|
||||
description: |
|
||||
When specified, write the fraction of counts originating from ribosomal genes
|
||||
(based on --ribosomal_gene_regex) to an .obs column with the specified name.
|
||||
Requires --var_name_ribosomal_genes.
|
||||
- name: --ribosomal_gene_regex
|
||||
type: string
|
||||
description: |
|
||||
Regex string that identifies ribosomal genes from --var_gene_names.
|
||||
By default will detect human and mouse ribosomal genes from a gene symbol.
|
||||
required: false
|
||||
default: "^[Mm]?[Rr][Pp][LlSs]"
|
||||
|
||||
- name: "QC metrics calculation options"
|
||||
arguments:
|
||||
- name: "--var_qc_metrics"
|
||||
description: |
|
||||
Keys to select a boolean (containing only True or False) column from .var.
|
||||
For each cell, calculate the proportion of total values for genes which are labeled 'True',
|
||||
compared to the total sum of the values for all genes. Defaults to the combined values specified for
|
||||
--var_name_mitochondrial_genes and --highly_variable_features_var_output.
|
||||
type: string
|
||||
multiple: True
|
||||
multiple_sep: ','
|
||||
required: false
|
||||
example: "ercc,highly_variable"
|
||||
- name: "--top_n_vars"
|
||||
type: integer
|
||||
description: |
|
||||
Number of top vars to be used to calculate cumulative proportions.
|
||||
If not specified, proportions are not calculated. `--top_n_vars 20,50` finds
|
||||
cumulative proportion to the 20th and 50th most expressed vars.
|
||||
multiple: true
|
||||
multiple_sep: ','
|
||||
required: false
|
||||
default: [50, 100, 200, 500]
|
||||
|
||||
- name: "PCA options"
|
||||
arguments:
|
||||
- name: "--pca_overwrite"
|
||||
type: boolean_true
|
||||
description: "Allow overwriting slots for PCA output."
|
||||
|
||||
- name: "CLR options"
|
||||
arguments:
|
||||
- name: "--clr_axis"
|
||||
type: integer
|
||||
description: "Axis to perform the CLR transformation on."
|
||||
default: 0
|
||||
required: false
|
||||
|
||||
- name: "RNA Scaling options"
|
||||
description: |
|
||||
Options for enabling scaling of the log-normalized data to unit variance and zero mean.
|
||||
The scaled data will be output to a different layer and a representation with reduced dimensions
|
||||
will be created and stored in addition to the non-scaled data.
|
||||
arguments:
|
||||
- name: "--rna_enable_scaling"
|
||||
description: "Enable scaling for the RNA modality."
|
||||
type: boolean_true
|
||||
- name: "--rna_scaling_output_layer"
|
||||
type: string
|
||||
default: "scaled"
|
||||
description: "Output layer where the scaled log-normalized data will be stored."
|
||||
- name: "--rna_scaling_pca_obsm_output"
|
||||
type: string
|
||||
description: |
|
||||
Name of the .obsm key where the PCA representation of the log-normalized
|
||||
and scaled data is stored.
|
||||
default: "scaled_pca"
|
||||
- name: "--rna_scaling_pca_loadings_varm_output"
|
||||
type: string
|
||||
description: |
|
||||
Name of the .varm key where the PCA loadings of the log-normalized and scaled
|
||||
data is stored.
|
||||
default: "scaled_pca_loadings"
|
||||
- name: "--rna_scaling_pca_variance_uns_output"
|
||||
type: string
|
||||
description: |
|
||||
Name of the .uns key where the variance and variance ratio will be stored as a map.
|
||||
The map will contain two keys: variance and variance_ratio respectively.
|
||||
default: "scaled_pca_variance"
|
||||
- name: "--rna_scaling_umap_obsm_output"
|
||||
type: string
|
||||
description:
|
||||
Name of the .obsm key where the UMAP representation of the log-normalized
|
||||
and scaled data is stored.
|
||||
default: "scaled_umap"
|
||||
- name: "--rna_scaling_max_value"
|
||||
description: "Clip (truncate) data to this value after scaling. If not specified, do not clip."
|
||||
required: false
|
||||
type: double
|
||||
- name: "--rna_scaling_zero_center"
|
||||
type: boolean_false
|
||||
description: "If set, omit zero-centering variables, which allows sparse input to be handled efficiently."
|
||||
|
||||
dependencies:
|
||||
- name: workflows/multiomics/process_samples
|
||||
alias: spatial_sample_processing
|
||||
repository: openpipeline_scrublet
|
||||
|
||||
repositories:
|
||||
- name: openpipeline_scrublet
|
||||
repo: openpipelines-bio/openpipeline
|
||||
type: github
|
||||
tag: disable-scrublet_build
|
||||
|
||||
resources:
|
||||
- type: nextflow_script
|
||||
path: main.nf
|
||||
entrypoint: run_wf
|
||||
|
||||
test_resources:
|
||||
- type: nextflow_script
|
||||
path: test.nf
|
||||
entrypoint: test_wf
|
||||
- path: /resources_test/xenium/xenium_tiny.h5mu
|
||||
|
||||
runners:
|
||||
- type: nextflow
|
||||
17
src/workflows/multiomics/spatial_process_samples/integration_test.sh
Executable file
17
src/workflows/multiomics/spatial_process_samples/integration_test.sh
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -eo pipefail
|
||||
|
||||
# get the root of the repository
|
||||
REPO_ROOT=$(git rev-parse --show-toplevel)
|
||||
|
||||
# ensure that the command below is run from the root of the repository
|
||||
cd "$REPO_ROOT"
|
||||
|
||||
nextflow \
|
||||
run . \
|
||||
-main-script src/workflows/multiomics/spatial_process_samples/test.nf \
|
||||
-entry test_wf \
|
||||
-profile docker,no_publish \
|
||||
-c src/workflows/utils/labels_ci.config \
|
||||
-c src/workflows/utils/integration_tests.config
|
||||
77
src/workflows/multiomics/spatial_process_samples/main.nf
Normal file
77
src/workflows/multiomics/spatial_process_samples/main.nf
Normal file
@@ -0,0 +1,77 @@
|
||||
workflow run_wf {
|
||||
take:
|
||||
input_ch
|
||||
|
||||
main:
|
||||
output_ch = input_ch
|
||||
| map { id, state ->
|
||||
def new_state = [
|
||||
state.id,
|
||||
state + ["_meta": ["join_id": id], "workflow_output": state.output]
|
||||
]
|
||||
new_state
|
||||
}
|
||||
| spatial_sample_processing.run(
|
||||
fromState: { id, state -> [
|
||||
"id": id,
|
||||
"input": state.input,
|
||||
"rna_layer": state.rna_layer,
|
||||
"prot_layer": state.prot_layer,
|
||||
"add_id_to_obs": state.add_id_to_obs,
|
||||
"add_id_obs_output": state.add_id_obs_output,
|
||||
"add_id_make_observation_keys_unique": state.add_id_make_observation_keys_unique,
|
||||
"rna_min_counts": state.rna_min_counts,
|
||||
"rna_max_counts": state.rna_max_counts,
|
||||
"rna_min_genes_per_cell": state.rna_min_genes_per_cell,
|
||||
"rna_max_genes_per_cell": state.rna_max_genes_per_cell,
|
||||
"rna_min_cells_per_gene": state.rna_min_cells_per_gene,
|
||||
"rna_min_fraction_mito": state.rna_min_fraction_mito,
|
||||
"rna_max_fraction_mito": state.rna_max_fraction_mito,
|
||||
"rna_min_fraction_ribo": state.rna_min_fraction_ribo,
|
||||
"rna_max_fraction_ribo": state.rna_max_fraction_ribo,
|
||||
"prot_min_counts": state.prot_min_counts,
|
||||
"prot_max_counts": state.prot_max_counts,
|
||||
"prot_min_proteins_per_cell": state.prot_min_proteins_per_cell,
|
||||
"prot_max_proteins_per_cell": state.prot_max_proteins_per_cell,
|
||||
"prot_min_cells_per_protein": state.prot_min_cells_per_protein,
|
||||
"highly_variable_features_var_output": state.highly_variable_features_var_output,
|
||||
"highly_variable_features_obs_batch_key": state.highly_variable_features_obs_batch_key,
|
||||
"var_gene_names": state.var_gene_names,
|
||||
"var_name_mitochondrial_genes": state.var_name_mitochondrial_genes,
|
||||
"obs_name_mitochondrial_fraction": state.obs_name_mitochondrial_fraction,
|
||||
"mitochondrial_gene_regex": state.mitochondrial_gene_regex,
|
||||
"var_name_ribosomal_genes": state.var_name_ribosomal_genes,
|
||||
"obs_name_ribosomal_fraction": state.obs_name_ribosomal_fraction,
|
||||
"ribosomal_gene_regex": state.ribosomal_gene_regex,
|
||||
"var_qc_metrics": state.var_qc_metrics,
|
||||
"top_n_vars": state.top_n_vars,
|
||||
"pca_overwrite": state.pca_overwrite,
|
||||
"clr_axis": state.clr_axis,
|
||||
"rna_enable_scaling": state.rna_enable_scaling,
|
||||
"rna_scaling_output_layer": state.rna_scaling_output_layer,
|
||||
"rna_scaling_pca_obsm_output": state.rna_scaling_pca_obsm_output,
|
||||
"rna_scaling_pca_loadings_varm_output": state.rna_scaling_pca_loadings_varm_output,
|
||||
"rna_scaling_pca_variance_uns_output": state.rna_scaling_pca_variance_uns_output,
|
||||
"rna_scaling_umap_obsm_output": state.rna_scaling_umap_obsm_output,
|
||||
"rna_scaling_max_value": state.rna_scaling_max_value,
|
||||
"rna_scaling_zero_center": state.rna_scaling_zero_center,
|
||||
"output": state.workflow_output
|
||||
]},
|
||||
args: [
|
||||
"skip_scrublet_filtering": "true",
|
||||
],
|
||||
toState: [
|
||||
"output": "output"
|
||||
]
|
||||
)
|
||||
|
||||
| setState(
|
||||
[
|
||||
"_meta": "_meta",
|
||||
"output": "output"
|
||||
]
|
||||
)
|
||||
|
||||
emit:
|
||||
output_ch
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
manifest {
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
}
|
||||
|
||||
params {
|
||||
rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
|
||||
}
|
||||
|
||||
// include common settings
|
||||
includeConfig("${params.rootDir}/src/workflows/utils/labels.config")
|
||||
33
src/workflows/multiomics/spatial_process_samples/test.nf
Normal file
33
src/workflows/multiomics/spatial_process_samples/test.nf
Normal file
@@ -0,0 +1,33 @@
|
||||
nextflow.enable.dsl=2
|
||||
targetDir = params.rootDir + "/target/nextflow"
|
||||
|
||||
include { spatial_process_samples } from targetDir + "/workflows/multiomics/spatial_process_samples/main.nf"
|
||||
|
||||
params.resources_test = params.rootDir + "/resources_test"
|
||||
|
||||
workflow test_wf {
|
||||
|
||||
resources_test = file(params.resources_test)
|
||||
|
||||
output_ch = Channel.fromList([
|
||||
[
|
||||
id: "xenium",
|
||||
input: resources_test.resolve("xenium/xenium_tiny.h5mu"),
|
||||
publish_dir: "foo/",
|
||||
output: "test.h5mu",
|
||||
]
|
||||
])
|
||||
| map{ state -> [state.id, state] }
|
||||
| spatial_process_samples
|
||||
| view { output ->
|
||||
assert output.size() == 2 : "outputs should contain two elements; [id, file]"
|
||||
assert output[1].output.toString().endsWith("test.h5mu") : "Output file should be a h5mu file. Found: ${output[1].output}"
|
||||
"Output: $output"
|
||||
}
|
||||
| toSortedList()
|
||||
| map { output_list ->
|
||||
assert output_list.size() == 1 : "output channel should contain one event"
|
||||
assert output_list[0][0] == "merged" : "Output ID should be 'merged'"
|
||||
}
|
||||
|
||||
}
|
||||
174
src/workflows/qc/spatial_qc/config.vsh.yaml
Normal file
174
src/workflows/qc/spatial_qc/config.vsh.yaml
Normal file
@@ -0,0 +1,174 @@
|
||||
name: "spatial_qc"
|
||||
namespace: "workflows/qc"
|
||||
scope: "public"
|
||||
description: "A pipeline to add basic qc statistics to a MuData containing spatial data."
|
||||
authors:
|
||||
- __merge__: /src/authors/dries_schaumont.yaml
|
||||
roles: [ author, maintainer ]
|
||||
- __merge__: /src/authors/dorien_roosen.yaml
|
||||
roles: [ contributor ]
|
||||
- __merge__: /src/authors/weiwei_schultz.yaml
|
||||
roles: [ contributor ]
|
||||
info:
|
||||
test_dependencies:
|
||||
- name: qc_test
|
||||
namespace: test_workflows/qc
|
||||
argument_groups:
|
||||
- name: Inputs
|
||||
arguments:
|
||||
- name: "--id"
|
||||
required: true
|
||||
type: string
|
||||
description: ID of the sample.
|
||||
example: foo
|
||||
- name: "--input"
|
||||
alternatives: [-i]
|
||||
description: Path to the sample.
|
||||
required: true
|
||||
example: input.h5mu
|
||||
type: file
|
||||
- name: "--modality"
|
||||
description: Which modality to process.
|
||||
type: string
|
||||
default: "rna"
|
||||
required: false
|
||||
- name: "--layer"
|
||||
description: "Use specified layer for calculation of qc metrics. If not specified, adata.X is used."
|
||||
type: string
|
||||
example: "raw_counts"
|
||||
required: false
|
||||
- name: "Mitochondrial & Ribosomal Gene Detection"
|
||||
arguments:
|
||||
- name: "--var_gene_names"
|
||||
required: false
|
||||
example: "gene_symbol"
|
||||
type: string
|
||||
description: |
|
||||
.var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).
|
||||
Gene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be
|
||||
identified as mitochondrial or ribosomal genes, respectively.
|
||||
- name: "--var_name_mitochondrial_genes"
|
||||
type: string
|
||||
required: false
|
||||
description: |
|
||||
In which .var slot to store a boolean array corresponding to the mitochondrial genes.
|
||||
- name: "--obs_name_mitochondrial_fraction"
|
||||
type: string
|
||||
required: false
|
||||
description: |
|
||||
.obs slot to store the fraction of reads found to be mitochondrial. Defaults to 'fraction_' followed by the value of --var_name_mitochondrial_genes.
|
||||
- name: --mitochondrial_gene_regex
|
||||
type: string
|
||||
description: |
|
||||
Regex string that identifies mitochondrial genes from --var_gene_names.
|
||||
By default will detect human and mouse mitochondrial genes from a gene symbol.
|
||||
required: false
|
||||
default: "^[mM][tT]-"
|
||||
- name: "--var_name_ribosomal_genes"
|
||||
type: string
|
||||
required: false
|
||||
description: |
|
||||
In which .var slot to store a boolean array corresponding to the ribosomal genes.
|
||||
- name: "--obs_name_ribosomal_fraction"
|
||||
type: string
|
||||
required: false
|
||||
description: |
|
||||
When specified, write the fraction of counts originating from ribosomal genes
|
||||
(based on --ribosomal_gene_regex) to an .obs column with the specified name.
|
||||
Requires --var_name_ribosomal_genes.
|
||||
- name: --ribosomal_gene_regex
|
||||
type: string
|
||||
description: |
|
||||
Regex string that identifies ribosomal genes from --var_gene_names.
|
||||
By default will detect human and mouse ribosomal genes from a gene symbol.
|
||||
required: false
|
||||
default: "^[Mm]?[Rr][Pp][LlSs]"
|
||||
- name: "QC metrics calculation options"
|
||||
arguments:
|
||||
- name: "--var_qc_metrics"
|
||||
description: |
|
||||
Keys to select a boolean (containing only True or False) column from .var.
|
||||
For each cell, calculate the proportion of total values for genes which are labeled 'True',
|
||||
compared to the total sum of the values for all genes. Defaults to the value from
|
||||
--var_name_mitochondrial_genes.
|
||||
type: string
|
||||
multiple: True
|
||||
multiple_sep: ','
|
||||
required: false
|
||||
example: "ercc,highly_variable"
|
||||
- name: "--top_n_vars"
|
||||
type: integer
|
||||
description: |
|
||||
Number of top vars to be used to calculate cumulative proportions.
|
||||
If not specified, proportions are not calculated. `--top_n_vars 20,50` finds
|
||||
cumulative proportion to the 20th and 50th most expressed vars.
|
||||
multiple: true
|
||||
multiple_sep: ','
|
||||
required: false
|
||||
default: [50, 100, 200, 500]
|
||||
- name: "--output_obs_num_nonzero_vars"
|
||||
description: |
|
||||
Name of column in .obs describing, for each observation, the number of stored values
|
||||
(including explicit zeroes). In other words, the name of the column that counts
|
||||
for each row the number of columns that contain data.
|
||||
type: string
|
||||
required: false
|
||||
default: "num_nonzero_vars"
|
||||
- name: "--output_obs_total_counts_vars"
|
||||
description: |
|
||||
Name of the column for .obs describing, for each observation (row),
|
||||
the sum of the stored values in the columns.
|
||||
type: string
|
||||
required: false
|
||||
default: total_counts
|
||||
- name: "--output_var_num_nonzero_obs"
|
||||
description: |
|
||||
Name of column describing, for each feature, the number of stored values
|
||||
(including explicit zeroes). In other words, the name of the column that counts
|
||||
for each column the number of rows that contain data.
|
||||
type: string
|
||||
required: false
|
||||
default: "num_nonzero_obs"
|
||||
- name: "--output_var_total_counts_obs"
|
||||
description: |
|
||||
Name of the column in .var describing, for each feature (column),
|
||||
the sum of the stored values in the rows.
|
||||
type: string
|
||||
required: false
|
||||
default: total_counts
|
||||
- name: "--output_var_obs_mean"
|
||||
type: string
|
||||
description: |
|
||||
Name of the column in .obs providing the mean of the values in each row.
|
||||
default: "obs_mean"
|
||||
required: false
|
||||
- name: "--output_var_pct_dropout"
|
||||
type: string
|
||||
default: "pct_dropout"
|
||||
description: |
|
||||
Name of the column in .var providing for each feature the percentage of
|
||||
observations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`
|
||||
but percentage based.
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- name: "--output"
|
||||
type: file
|
||||
required: true
|
||||
direction: output
|
||||
description: Destination path to the output.
|
||||
example: output.h5mu
|
||||
dependencies:
|
||||
- name: workflows/qc/qc
|
||||
alias: spatial_qc_workflow
|
||||
repository: openpipeline
|
||||
resources:
|
||||
- type: nextflow_script
|
||||
path: main.nf
|
||||
entrypoint: run_wf
|
||||
test_resources:
|
||||
- type: nextflow_script
|
||||
path: test.nf
|
||||
entrypoint: test_wf
|
||||
- path: /resources_test/xenium/xenium_tiny.h5mu
|
||||
runners:
|
||||
- type: nextflow
|
||||
15
src/workflows/qc/spatial_qc/integration_test.sh
Normal file
15
src/workflows/qc/spatial_qc/integration_test.sh
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash

# Integration test launcher for the spatial_qc workflow.
# Runs the `test_wf` entrypoint of src/workflows/qc/spatial_qc/test.nf with
# the docker and no_publish profiles and the CI resource labels.

# Abort on the first failing command (including failures inside pipelines),
# so that a failed `git rev-parse` or `cd` cannot lead to nextflow being
# started from the wrong working directory.
set -eo pipefail

# get the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

nextflow \
  run . \
  -main-script src/workflows/qc/spatial_qc/test.nf \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
|
||||
38
src/workflows/qc/spatial_qc/main.nf
Normal file
38
src/workflows/qc/spatial_qc/main.nf
Normal file
@@ -0,0 +1,38 @@
|
||||
workflow run_wf {
|
||||
take:
|
||||
input_ch
|
||||
|
||||
main:
|
||||
output_ch = input_ch
|
||||
| spatial_qc_workflow.run(
|
||||
fromState: { id, state -> [
|
||||
"id": id,
|
||||
"input": state.input,
|
||||
"modality": state.modality,
|
||||
"layer": state.layer,
|
||||
"var_gene_names": state.var_gene_names,
|
||||
"var_name_mitochondrial_genes": state.var_name_mitochondrial_genes,
|
||||
"obs_name_mitochondrial_fraction": state.obs_name_mitochondrial_fraction,
|
||||
"mitochondrial_gene_regex": state.mitochondrial_gene_regex,
|
||||
"var_name_ribosomal_genes": state.var_name_ribosomal_genes,
|
||||
"obs_name_ribosomal_fraction": state.obs_name_ribosomal_fraction,
|
||||
"ribosomal_gene_regex": state.ribosomal_gene_regex,
|
||||
"var_qc_metrics": state.var_qc_metrics,
|
||||
"top_n_vars": state.top_n_vars,
|
||||
"output_obs_num_nonzero_vars": state.output_obs_num_nonzero_vars,
|
||||
"output_obs_total_counts_vars": state.output_obs_total_counts_vars,
|
||||
"output_var_num_nonzero_obs": state.output_var_num_nonzero_obs,
|
||||
"output_var_total_counts_obs": state.output_var_total_counts_obs,
|
||||
"output_var_obs_mean": state.output_var_obs_mean,
|
||||
"output_var_pct_dropout": state.output_var_pct_dropout
|
||||
]},
|
||||
toState: [
|
||||
"output": "output"
|
||||
]
|
||||
)
|
||||
|
||||
| setState(["output"])
|
||||
|
||||
emit:
|
||||
output_ch
|
||||
}
|
||||
10
src/workflows/qc/spatial_qc/nextflow.config
Normal file
10
src/workflows/qc/spatial_qc/nextflow.config
Normal file
@@ -0,0 +1,10 @@
|
||||
manifest {
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
}
|
||||
|
||||
params {
|
||||
rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
|
||||
}
|
||||
|
||||
// include common settings
|
||||
includeConfig("${params.rootDir}/src/workflows/utils/labels.config")
|
||||
40
src/workflows/qc/spatial_qc/test.nf
Normal file
40
src/workflows/qc/spatial_qc/test.nf
Normal file
@@ -0,0 +1,40 @@
|
||||
nextflow.enable.dsl=2
|
||||
|
||||
include { spatial_qc } from params.rootDir + "/target/nextflow/workflows/qc/spatial_qc/main.nf"
|
||||
|
||||
params.resources_test = params.rootDir + "/resources_test"
|
||||
|
||||
workflow test_wf {
|
||||
|
||||
resources_test = file(params.resources_test)
|
||||
|
||||
output_ch =
|
||||
Channel.fromList([
|
||||
[
|
||||
id: "xenium_test",
|
||||
input: resources_test.resolve("xenium/xenium_tiny.h5mu"),
|
||||
var_name_mitochondrial_genes: "mitochondrial",
|
||||
var_name_ribosomal_genes: "ribosomal",
|
||||
]
|
||||
])
|
||||
| map { state -> [state.id, state] }
|
||||
| spatial_qc.run(
|
||||
toState: { id, output, state -> output + [og_input: state.input] }
|
||||
)
|
||||
|
||||
| view { output ->
|
||||
assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
|
||||
|
||||
// check id
|
||||
def id = output[0]
|
||||
assert id.endsWith("_test")
|
||||
|
||||
// check output
|
||||
def state = output[1]
|
||||
assert state instanceof Map : "State should be a map. Found: ${state}"
|
||||
assert state.containsKey("output") : "Output should contain key 'output'."
|
||||
assert state.output.isFile() : "'output' should be a file."
|
||||
assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"
|
||||
|
||||
}
|
||||
}
|
||||
36
src/workflows/utils/integration_tests.config
Normal file
36
src/workflows/utils/integration_tests.config
Normal file
@@ -0,0 +1,36 @@
|
||||
profiles {
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
}
|
||||
105
src/workflows/utils/labels_ci.config
Normal file
105
src/workflows/utils/labels_ci.config
Normal file
@@ -0,0 +1,105 @@
|
||||
process {
|
||||
withLabel: lowmem { memory = 13.Gb }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midmem { memory = 13.Gb }
|
||||
withLabel: midcpu { cpus = 4 }
|
||||
withLabel: highmem { memory = 13.Gb }
|
||||
withLabel: highcpu { cpus = 4 }
|
||||
withLabel: veryhighmem { memory = 13.Gb }
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
}
|
||||
|
||||
env.NUMBA_CACHE_DIR = '/tmp'
|
||||
|
||||
trace {
|
||||
enabled = true
|
||||
overwrite = true
|
||||
}
|
||||
dag {
|
||||
overwrite = true
|
||||
}
|
||||
|
||||
process.maxForks = 1
|
||||
|
||||
profiles {
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docker {
|
||||
docker.fixOwnership = true
|
||||
docker.enabled = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
|
||||
local {
|
||||
// This config is for local processing.
|
||||
process {
|
||||
maxMemory = 25.GB
|
||||
withLabel: verylowcpu { cpus = 2 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 6 }
|
||||
withLabel: highcpu { cpus = 12 }
|
||||
|
||||
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Cap a requested memory amount at `process.maxMemory`.
 *
 * Behaviour, as implemented below:
 *   - when `process.maxMemory` is not set (or falsy), the request is
 *     returned unchanged;
 *   - when `process.maxRetries` is set and the current `task.attempt`
 *     equals it, `process.maxMemory` is returned (last attempt gets the
 *     full cap);
 *   - when the request exceeds `process.maxMemory`, the cap is returned;
 *   - otherwise the request is returned unchanged.
 *
 * @param to_compare the requested amount of memory; it is compared against
 *                   `process.maxMemory` converted to nextflow.util.MemoryUnit
 * @return the amount of memory to assign to the task
 *
 * Any exception raised while evaluating the limits is reported on stdout
 * and terminates the run with exit code 1.
 */
def get_memory(to_compare) {
  // No upper bound configured: nothing to cap.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Request is above the cap: return the cap itself.
      // (This branch previously returned the undefined variable `max_memory`,
      // which raised here and made every over-cap request abort the run.)
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,318 @@
|
||||
name: "grep_annotation_column"
|
||||
namespace: "metadata"
|
||||
version: "2.1.2"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
description: "Arguments related to the input dataset."
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Path to the input .h5mu."
|
||||
info: null
|
||||
example:
|
||||
- "sample_path"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--input_column"
|
||||
description: "Column to query. If not specified, use .var_names or .obs_names,\
|
||||
\ depending on the value of --matrix"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--input_layer"
|
||||
description: "Input data to use when calculating fraction of observations that\
|
||||
\ match with the query. \nOnly used when --output_fraction_column is provided.\
|
||||
\ If not specified, .X is used.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "Which modality to get the annotation matrix from.\n"
|
||||
info: null
|
||||
example:
|
||||
- "rna"
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--matrix"
|
||||
description: "Matrix to fetch the column from that will be searched."
|
||||
info: null
|
||||
example:
|
||||
- "var"
|
||||
required: false
|
||||
choices:
|
||||
- "var"
|
||||
- "obs"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
description: "Arguments related to how the output will be written."
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: false
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "The compression format to be used on the output h5mu object."
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_match_column"
|
||||
description: "Name of the column to write the result to."
|
||||
info: null
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_fraction_column"
|
||||
description: "For the opposite axis, name of the column to write the fraction\
|
||||
\ of \nobservations that matches to the pattern.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Query options"
|
||||
description: "Options related to the query"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--regex_pattern"
|
||||
description: "Regex to use to match with the input column."
|
||||
info: null
|
||||
example:
|
||||
- "^[mM][tT]-"
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "compress_h5mu.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Perform a regex lookup on a column from the annotation matrices .obs\
|
||||
\ or .var.\nThe annotation matrix can originate from either a modality, or all modalities\
|
||||
\ (global .var or .obs).\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "singlecpu"
|
||||
- "lowmem"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.11-slim"
|
||||
target_tag: "2.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/metadata/grep_annotation_column/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/metadata/grep_annotation_column"
|
||||
executable: "target/nextflow/metadata/grep_annotation_column/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "2.1.1-2-ga0c95224865"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"2.1.2\""
|
||||
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
@@ -0,0 +1,87 @@
|
||||
import shutil
|
||||
from anndata import AnnData
|
||||
from mudata import write_h5ad
|
||||
from h5py import File as H5File
|
||||
from h5py import Group, Dataset
|
||||
from pathlib import Path
|
||||
from typing import Union, Literal
|
||||
from functools import partial
|
||||
|
||||
|
||||
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a copy of an HDF5-backed .h5mu file with dataset compression applied.

    Every group and dataset found in ``input_path`` is recreated in
    ``output_path`` together with its HDF5 attributes. Non-scalar datasets
    that are not compressed yet are rewritten with ``compression``; all other
    datasets are copied as they are. Finally, the textual header stored in
    the HDF5 user block of the input is carried over to the 512-byte user
    block of the output.

    Args:
        input_path: Path of the existing file to read.
        output_path: Path of the file to create (opened by h5py in "w" mode).
        compression: Compression filter handed to ``create_dataset``.

    Raises:
        NotImplementedError: If the input contains an object that is neither
            a group nor a dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the user block reserved at the start of the output file.
    user_block_size = 512

    def copy_attributes(in_object, out_object):
        # Mirror all HDF5 attributes of one object onto another.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        item: Union[Group, Dataset],
    ):
        # Callback invoked by visititems() for every object in the input file.
        if isinstance(item, Group):
            copy_attributes(item, output_h5.create_group(name))
            return
        if not isinstance(item, Dataset):
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(item)}"
            )
        # Compression only works for non-scalar Dataset objects;
        # scalar objects don't have a shape defined.
        if not item.compression and item.shape not in [None, ()]:
            new_dataset = output_h5.create_dataset(
                name, data=item, compression=compression
            )
            copy_attributes(item, new_dataset)
        else:
            # Already compressed or scalar: copy the dataset unchanged.
            output_h5.copy(item, name)

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=user_block_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))
        # Only bytes that belong to the input's user block may be copied;
        # an input without a user block starts directly with HDF5 data.
        header_size = min(input_h5.userblock_size, user_block_size)

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        starting_metadata = input_bytes.read(header_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # find() returns -1 when there is no padding; slicing with -1 would
    # silently drop the last header byte, so only truncate on a real match.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]
    with open(output_path, "br+") as f:
        # Header followed by null padding, filling the whole user block.
        f.write(starting_metadata.ljust(user_block_size, b"\0"))
|
||||
|
||||
|
||||
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file, write one modality into the copy and optionally compress it.

    Without ``output_compression`` the input is copied straight to
    ``output_file`` and the modality is written into that copy. With
    ``output_compression`` the same steps are performed on an intermediate
    file named ``<output stem>_uncompressed.h5mu`` next to ``output_file``,
    which is then compressed into ``output_file`` and removed.

    Args:
        output_file: Path of the file to produce.
        h5mu: Path of the existing .h5mu file used as the starting point.
        modality_name: Name of the modality, passed as ``mod`` to ``write_h5ad``.
        modality_data: Data for the modality, passed as ``data`` to ``write_h5ad``.
        output_compression: Compression passed to ``compress_h5mu``; a falsy
            value disables the compression step.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    # The copy happens outside the try block on purpose: when it fails
    # (for example because source and destination are the same file) the
    # cleanup below must not delete a file this function did not create.
    shutil.copyfile(h5mu, output_file_uncompressed)
    try:
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Remove the intermediate file on failure as well, so that an
        # error does not leave a stray uncompressed copy behind.
        output_file_uncompressed.unlink(missing_ok=True)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'metadata/grep_annotation_column'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = '2.1.2'
|
||||
description = 'Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n'
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Cap a requested memory amount at process.maxMemory.
 *
 * Returns to_compare unchanged when process.maxMemory is not set. Otherwise
 * returns process.maxMemory on the attempt whose number equals
 * process.maxRetries, or when to_compare exceeds process.maxMemory; in all
 * other cases to_compare is returned. Any error raised while evaluating the
 * limits is reported on stdout and terminates the JVM with exit code 1.
 *
 * NOTE(review): `task` is not a parameter of this function; confirm that it
 * resolves in the context from which the label closures call get_memory.
 */
def get_memory(to_compare) {
  // Nothing to cap against: hand back the request untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request exceeds the cap. The original returned the undefined
      // name `max_memory` here, which always ended up in the catch block.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,21 @@
|
||||
# Inputs
|
||||
input: # please fill in - example: "sample_path"
|
||||
# input_column: "foo"
|
||||
# input_layer: "foo"
|
||||
modality: # please fill in - example: "rna"
|
||||
# matrix: "var"
|
||||
|
||||
# Outputs
|
||||
# output: "$id.$key.output.h5mu"
|
||||
# output_compression: "gzip"
|
||||
output_match_column: # please fill in - example: "foo"
|
||||
# output_fraction_column: "foo"
|
||||
|
||||
# Query options
|
||||
regex_pattern: # please fill in - example: "^[mM][tT]-"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
|
||||
# Arguments
|
||||
@@ -0,0 +1,200 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema",
|
||||
"title": "grep_annotation_column",
|
||||
"description": "Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
|
||||
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv",
|
||||
"pattern": "^\\S+\\.csv$"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
|
||||
"inputs" : {
|
||||
"title": "Inputs",
|
||||
"type": "object",
|
||||
"description": "Arguments related to the input dataset.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"input": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, example: `sample_path`. Path to the input ",
|
||||
"help_text": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"input_column": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. Column to query",
|
||||
"help_text": "Type: `string`. Column to query. If not specified, use .var_names or .obs_names, depending on the value of --matrix"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"input_layer": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. Input data to use when calculating fraction of observations that match with the query",
|
||||
"help_text": "Type: `string`. Input data to use when calculating fraction of observations that match with the query. \nOnly used when --output_fraction_column is provided. If not specified, .X is used.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"modality": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `rna`. Which modality to get the annotation matrix from",
|
||||
"help_text": "Type: `string`, required, example: `rna`. Which modality to get the annotation matrix from.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"matrix": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `var`, choices: ``var`, `obs``. Matrix to fetch the column from that will be searched",
|
||||
"help_text": "Type: `string`, example: `var`, choices: ``var`, `obs``. Matrix to fetch the column from that will be searched.",
|
||||
"enum": ["var", "obs"]
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"outputs" : {
|
||||
"title": "Outputs",
|
||||
"type": "object",
|
||||
"description": "Arguments related to how the output will be written.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. ",
|
||||
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. "
|
||||
,
|
||||
"default":"$id.$key.output.h5mu"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_compression": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object",
|
||||
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object.",
|
||||
"enum": ["gzip", "lzf"]
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_match_column": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required. Name of the column to write the result to",
|
||||
"help_text": "Type: `string`, required. Name of the column to write the result to."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_fraction_column": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern",
|
||||
"help_text": "Type: `string`. For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"query options" : {
|
||||
"title": "Query options",
|
||||
"type": "object",
|
||||
"description": "Options related to the query",
|
||||
"properties": {
|
||||
|
||||
|
||||
"regex_pattern": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `^[mM][tT]-`. Regex to use to match with the input column",
|
||||
"help_text": "Type: `string`, required, example: `^[mM][tT]-`. Regex to use to match with the input column."
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"nextflow input-output arguments" : {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"publish_dir": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
|
||||
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/inputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/outputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/query options"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to emit INFO-level records on stdout.

    The root logger's level is set to INFO and a stream handler writing to
    ``sys.stdout`` with the format ``"<time> <level> <message>"`` is attached.
    The function is idempotent: when a stream handler bound to the current
    ``sys.stdout`` is already attached, no second handler is added, so
    calling it repeatedly does not duplicate log lines.

    Returns:
        The root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Adding a fresh handler on every call would print each record once per
    # call; only attach one when stdout is not being served already.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        log_formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(log_formatter)
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,376 @@
|
||||
name: "calculate_qc_metrics"
|
||||
namespace: "qc"
|
||||
version: "2.1.2"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "author"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
description: "Input h5mu file"
|
||||
info: null
|
||||
example:
|
||||
- "input.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--layer"
|
||||
info: null
|
||||
example:
|
||||
- "raw_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Metrics added to .obs"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--var_qc_metrics"
|
||||
description: "Keys to select a boolean (containing only True or False) column\
|
||||
\ from .var.\nFor each cell, calculate the proportion of total values for genes\
|
||||
\ which are labeled 'True', \ncompared to the total sum of the values for all\
|
||||
\ genes.\n"
|
||||
info: null
|
||||
example:
|
||||
- "ercc,highly_variable,mitochondrial"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "boolean"
|
||||
name: "--var_qc_metrics_fill_na_value"
|
||||
description: "Fill any 'NA' values found in the columns specified with --var_qc_metrics\
|
||||
\ to 'True' or 'False'.\nas False.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--top_n_vars"
|
||||
description: "Number of top vars to be used to calculate cumulative proportions.\n\
|
||||
If not specified, proportions are not calculated. `--top_n_vars 20;50` finds\n\
|
||||
cumulative proportion to the 20th and 50th most expressed vars.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_obs_num_nonzero_vars"
|
||||
description: "Name of column in .obs describing, for each observation, the number\
|
||||
\ of stored values\n(including explicit zeroes). In other words, the name of\
|
||||
\ the column that counts\nfor each row the number of columns that contain data.\n"
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_vars"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_obs_total_counts_vars"
|
||||
description: "Name of the column for .obs describing, for each observation (row),\n\
|
||||
the sum of the stored values in the columns.\n"
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Metrics added to .var"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--output_var_num_nonzero_obs"
|
||||
description: "Name of column describing, for each feature, the number of stored\
|
||||
\ values\n(including explicit zeroes). In other words, the name of the column\
|
||||
\ that counts\nfor each column the number of rows that contain data.\n"
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_obs"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_total_counts_obs"
|
||||
description: "Name of the column in .var describing, for each feature (column),\n\
|
||||
the sum of the stored values in the rows.\n"
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_obs_mean"
|
||||
description: "Name of the column in .obs providing the mean of the values in each\
|
||||
\ row.\n"
|
||||
info: null
|
||||
default:
|
||||
- "obs_mean"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_pct_dropout"
|
||||
description: "Name of the column in .obs providing for each feature the percentage\
|
||||
\ of\nobservations the feature does not appear on (i.e. is missing). Same as\
|
||||
\ `--num_nonzero_obs`\nbut percentage based.\n"
|
||||
info: null
|
||||
default:
|
||||
- "pct_dropout"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
description: "Output h5mu file."
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: false
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "The compression format to be used on the output h5mu object."
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "compress_h5mu.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are\
|
||||
\ comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have\
|
||||
\ slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n\
|
||||
\ - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n\
|
||||
\ - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs\
|
||||
\ metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics}\
|
||||
\ -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n\
|
||||
\ - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n\
|
||||
\ - total_counts -> total_{expr_type}\n \n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "singlecpu"
|
||||
- "midmem"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.11-slim"
|
||||
target_tag: "2.1.0"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
- "scipy"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "scanpy"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/qc/calculate_qc_metrics/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/qc/calculate_qc_metrics"
|
||||
executable: "target/nextflow/qc/calculate_qc_metrics/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "2.1.1-2-ga0c95224865"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"2.1.2\""
|
||||
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
@@ -0,0 +1,87 @@
|
||||
import shutil
|
||||
from anndata import AnnData
|
||||
from mudata import write_h5ad
|
||||
from h5py import File as H5File
|
||||
from h5py import Group, Dataset
|
||||
from pathlib import Path
|
||||
from typing import Union, Literal
|
||||
from functools import partial
|
||||
|
||||
|
||||
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a copy of an HDF5-backed .h5mu file with dataset compression applied.

    Every group and dataset found in ``input_path`` is recreated in
    ``output_path`` together with its HDF5 attributes. Non-scalar datasets
    that are not compressed yet are rewritten with ``compression``; all other
    datasets are copied as they are. Finally, the textual header stored in
    the HDF5 user block of the input is carried over to the 512-byte user
    block of the output.

    Args:
        input_path: Path of the existing file to read.
        output_path: Path of the file to create (opened by h5py in "w" mode).
        compression: Compression filter handed to ``create_dataset``.

    Raises:
        NotImplementedError: If the input contains an object that is neither
            a group nor a dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the user block reserved at the start of the output file.
    user_block_size = 512

    def copy_attributes(in_object, out_object):
        # Mirror all HDF5 attributes of one object onto another.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        item: Union[Group, Dataset],
    ):
        # Callback invoked by visititems() for every object in the input file.
        if isinstance(item, Group):
            copy_attributes(item, output_h5.create_group(name))
            return
        if not isinstance(item, Dataset):
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(item)}"
            )
        # Compression only works for non-scalar Dataset objects;
        # scalar objects don't have a shape defined.
        if not item.compression and item.shape not in [None, ()]:
            new_dataset = output_h5.create_dataset(
                name, data=item, compression=compression
            )
            copy_attributes(item, new_dataset)
        else:
            # Already compressed or scalar: copy the dataset unchanged.
            output_h5.copy(item, name)

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=user_block_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))
        # Only bytes that belong to the input's user block may be copied;
        # an input without a user block starts directly with HDF5 data.
        header_size = min(input_h5.userblock_size, user_block_size)

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        starting_metadata = input_bytes.read(header_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # find() returns -1 when there is no padding; slicing with -1 would
    # silently drop the last header byte, so only truncate on a real match.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]
    with open(output_path, "br+") as f:
        # Header followed by null padding, filling the whole user block.
        f.write(starting_metadata.ljust(user_block_size, b"\0"))
|
||||
|
||||
|
||||
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file and write one modality into the copy.

    Without ``output_compression`` the input file is copied straight to
    ``output_file`` and the modality is written into it. With compression, the
    copy and the modality write happen on a sibling file named
    ``<stem>_uncompressed.h5mu``, which is then compressed into
    ``output_file`` and removed.

    Args:
        output_file: Destination path of the resulting .h5mu file.
        h5mu: Path of the existing .h5mu file used as the starting point.
        modality_name: Name of the modality passed to ``write_h5ad`` as ``mod``.
        modality_data: AnnData object passed to ``write_h5ad`` as ``data``.
        output_compression: Compression passed to ``compress_h5mu``; a falsy
            value disables compression.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    try:
        shutil.copyfile(h5mu, output_file_uncompressed)
        write_h5ad(
            filename=output_file_uncompressed,
            mod=modality_name,
            data=modality_data,
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Always remove the intermediate file, also when one of the steps
        # above raised, so a failed run does not leave it behind.
        output_file_uncompressed.unlink(missing_ok=True)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'qc/calculate_qc_metrics'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = '2.1.2'
|
||||
description = 'Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -> total_{expr_type}\n \n'
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
// Return the memory to request for a task: `to_compare`, capped at
// `process.maxMemory` when that setting is present and non-empty.
def get_memory(to_compare) {
  // No upper bound configured: use the requested amount unchanged.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, request the maximum.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // Requested amount exceeds the maximum: return the maximum instead.
    // (Previously this returned the undefined name `max_memory`, which
    // always ended up in the catch block below.)
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,27 @@
|
||||
# Inputs
|
||||
input: # please fill in - example: "input.h5mu"
|
||||
modality: "rna"
|
||||
# layer: "raw_counts"
|
||||
|
||||
# Metrics added to .obs
|
||||
# var_qc_metrics: ["ercc,highly_variable,mitochondrial"]
|
||||
# var_qc_metrics_fill_na_value: true
|
||||
# top_n_vars: [123]
|
||||
output_obs_num_nonzero_vars: "num_nonzero_vars"
|
||||
output_obs_total_counts_vars: "total_counts"
|
||||
|
||||
# Metrics added to .var
|
||||
output_var_num_nonzero_obs: "num_nonzero_obs"
|
||||
output_var_total_counts_obs: "total_counts"
|
||||
output_var_obs_mean: "obs_mean"
|
||||
output_var_pct_dropout: "pct_dropout"
|
||||
|
||||
# Outputs
|
||||
# output: "$id.$key.output.h5mu"
|
||||
# output_compression: "gzip"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
|
||||
# Arguments
|
||||
@@ -0,0 +1,259 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema",
|
||||
"title": "calculate_qc_metrics",
|
||||
"description": "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -\u003e name in scanpy):\n - pct_dropout -\u003e pct_dropout_by_{expr_type}\n - num_nonzero_obs -\u003e n_cells_by_{expr_type}\n - obs_mean -\u003e mean_{expr_type}\n - total_counts -\u003e total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -\u003e n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -\u003e pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -\u003e total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -\u003e pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -\u003e total_{expr_type}\n \n",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
|
||||
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv",
|
||||
"pattern": "^\\S+\\.csv$"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
|
||||
"inputs" : {
|
||||
"title": "Inputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"input": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, example: `input.h5mu`. Input h5mu file",
|
||||
"help_text": "Type: `file`, required, example: `input.h5mu`. Input h5mu file"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"modality": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `rna`. ",
|
||||
"help_text": "Type: `string`, default: `rna`. "
|
||||
,
|
||||
"default":"rna"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"layer": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `raw_counts`. ",
|
||||
"help_text": "Type: `string`, example: `raw_counts`. "
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"outputs" : {
|
||||
"title": "Outputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Output h5mu file",
|
||||
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Output h5mu file."
|
||||
,
|
||||
"default":"$id.$key.output.h5mu"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_compression": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object",
|
||||
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object.",
|
||||
"enum": ["gzip", "lzf"]
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"metrics added to .obs" : {
|
||||
"title": "Metrics added to .obs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"var_qc_metrics": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `string`, example: `ercc,highly_variable,mitochondrial`, multiple_sep: `\";\"`. Keys to select a boolean (containing only True or False) column from ",
|
||||
"help_text": "Type: List of `string`, example: `ercc,highly_variable,mitochondrial`, multiple_sep: `\";\"`. Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled \u0027True\u0027, \ncompared to the total sum of the values for all genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"var_qc_metrics_fill_na_value": {
|
||||
"type":
|
||||
"boolean",
|
||||
"description": "Type: `boolean`. Fill any \u0027NA\u0027 values found in the columns specified with --var_qc_metrics to \u0027True\u0027 or \u0027False\u0027",
|
||||
"help_text": "Type: `boolean`. Fill any \u0027NA\u0027 values found in the columns specified with --var_qc_metrics to \u0027True\u0027 or \u0027False\u0027.\nas False.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"top_n_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `integer`, multiple_sep: `\";\"`. Number of top vars to be used to calculate cumulative proportions",
|
||||
"help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20;50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_obs_num_nonzero_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `num_nonzero_vars`. Name of column in ",
|
||||
"help_text": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n"
|
||||
,
|
||||
"default":"num_nonzero_vars"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_obs_total_counts_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `total_counts`. Name of the column for ",
|
||||
"help_text": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n"
|
||||
,
|
||||
"default":"total_counts"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"metrics added to .var" : {
|
||||
"title": "Metrics added to .var",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"output_var_num_nonzero_obs": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)",
|
||||
"help_text": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n"
|
||||
,
|
||||
"default":"num_nonzero_obs"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_total_counts_obs": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `total_counts`. Name of the column in ",
|
||||
"help_text": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n"
|
||||
,
|
||||
"default":"total_counts"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_obs_mean": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `obs_mean`. Name of the column in ",
|
||||
"help_text": "Type: `string`, default: `obs_mean`. Name of the column in .obs providing the mean of the values in each row.\n"
|
||||
,
|
||||
"default":"obs_mean"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_pct_dropout": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `pct_dropout`. Name of the column in ",
|
||||
"help_text": "Type: `string`, default: `pct_dropout`. Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n"
|
||||
,
|
||||
"default":"pct_dropout"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"nextflow input-output arguments" : {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"publish_dir": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
|
||||
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/inputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/outputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/metrics added to .obs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/metrics added to .var"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to log to stdout and return it.

    The root logger's level is set to INFO and a stream handler writing to
    ``sys.stdout`` is attached, formatting records as
    ``<time> <level> <message>``. When a stream handler for the current
    stdout is already attached (for example because this function was called
    before), no second handler is added, so messages are not printed twice.

    Returns:
        The root ``logging.Logger``.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        log_formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(log_formatter)
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,406 @@
|
||||
name: "qc"
|
||||
namespace: "workflows/qc"
|
||||
version: "2.1.2"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "author"
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--id"
|
||||
description: "ID of the sample."
|
||||
info: null
|
||||
example:
|
||||
- "foo"
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Path to the sample."
|
||||
info: null
|
||||
example:
|
||||
- "input.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "Which modality to process."
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--layer"
|
||||
description: "Layer to calculate qc metrics for."
|
||||
info: null
|
||||
example:
|
||||
- "raw_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Mitochondrial & Ribosomal Gene Detection"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--var_gene_names"
|
||||
description: ".var column name to be used to detect mitochondrial/ribosomal genes\
|
||||
\ instead of .var_names (default if not set).\nGene names matching with the\
|
||||
\ regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will\
|
||||
\ be \nidentified as mitochondrial or ribosomal genes, respectively.\n"
|
||||
info: null
|
||||
example:
|
||||
- "gene_symbol"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--var_name_mitochondrial_genes"
|
||||
description: "In which .var slot to store a boolean array corresponding to the mitochondrial\
|
||||
\ genes.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_name_mitochondrial_fraction"
|
||||
description: ".obs slot to store the fraction of reads found to be mitochondrial.\
|
||||
\ Defaults to 'fraction_' suffixed by the value of --var_name_mitochondrial_genes\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--mitochondrial_gene_regex"
|
||||
description: "Regex string that identifies mitochondrial genes from --var_gene_names.\n\
|
||||
By default will detect human and mouse mitochondrial genes from a gene symbol.\n"
|
||||
info: null
|
||||
default:
|
||||
- "^[mM][tT]-"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--var_name_ribosomal_genes"
|
||||
description: "In which .var slot to store a boolean array corresponding to the ribosomal\
|
||||
\ genes.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_name_ribosomal_fraction"
|
||||
description: "When specified, write the fraction of counts originating from ribosomal\
|
||||
\ genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified\
|
||||
\ name.\nRequires --var_name_ribosomal_genes.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--ribosomal_gene_regex"
|
||||
description: "Regex string that identifies ribosomal genes from --var_gene_names.\n\
|
||||
By default will detect human and mouse ribosomal genes from a gene symbol.\n"
|
||||
info: null
|
||||
default:
|
||||
- "^[Mm]?[Rr][Pp][LlSs]"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "QC metrics calculation options"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--var_qc_metrics"
|
||||
description: "Keys to select a boolean (containing only True or False) column\
|
||||
\ from .var.\nFor each cell, calculate the proportion of total values for genes\
|
||||
\ which are labeled 'True', \ncompared to the total sum of the values for all\
|
||||
\ genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n"
|
||||
info: null
|
||||
example:
|
||||
- "ercc,highly_variable"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ","
|
||||
- type: "integer"
|
||||
name: "--top_n_vars"
|
||||
description: "Number of top vars to be used to calculate cumulative proportions.\n\
|
||||
If not specified, proportions are not calculated. `--top_n_vars 20,50` finds\n\
|
||||
cumulative proportion to the 20th and 50th most expressed vars.\n"
|
||||
info: null
|
||||
default:
|
||||
- 50
|
||||
- 100
|
||||
- 200
|
||||
- 500
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ","
|
||||
- type: "string"
|
||||
name: "--output_obs_num_nonzero_vars"
|
||||
description: "Name of column in .obs describing, for each observation, the number\
|
||||
\ of stored values\n(including explicit zeroes). In other words, the name of\
|
||||
\ the column that counts\nfor each row the number of columns that contain data.\n"
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_vars"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_obs_total_counts_vars"
|
||||
description: "Name of the column for .obs describing, for each observation (row),\n\
|
||||
the sum of the stored values in the columns.\n"
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_num_nonzero_obs"
|
||||
description: "Name of column describing, for each feature, the number of stored\
|
||||
\ values\n(including explicit zeroes). In other words, the name of the column\
|
||||
\ that counts\nfor each column the number of rows that contain data.\n"
|
||||
info: null
|
||||
default:
|
||||
- "num_nonzero_obs"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_total_counts_obs"
|
||||
description: "Name of the column in .var describing, for each feature (column),\n\
|
||||
the sum of the stored values in the rows.\n"
|
||||
info: null
|
||||
default:
|
||||
- "total_counts"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_obs_mean"
|
||||
description: "Name of the column in .obs providing the mean of the values in each\
|
||||
\ row.\n"
|
||||
info: null
|
||||
default:
|
||||
- "obs_mean"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_var_pct_dropout"
|
||||
description: "Name of the column in .obs providing for each feature the percentage\
|
||||
\ of\nobservations the feature does not appear on (i.e. is missing). Same as\
|
||||
\ `--output_var_num_nonzero_obs`\nbut percentage based.\n"
|
||||
info: null
|
||||
default:
|
||||
- "pct_dropout"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
description: "Destination path to the output."
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "nextflow_script"
|
||||
path: "main.nf"
|
||||
is_executable: true
|
||||
entrypoint: "run_wf"
|
||||
- type: "file"
|
||||
path: "utils"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "A pipeline to add basic qc statistics to a MuData "
|
||||
test_resources:
|
||||
- type: "nextflow_script"
|
||||
path: "test.nf"
|
||||
is_executable: true
|
||||
entrypoint: "test_wf"
|
||||
- type: "file"
|
||||
path: "concat_test_data"
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3"
|
||||
info:
|
||||
test_dependencies:
|
||||
- name: "qc_test"
|
||||
namespace: "test_workflows/qc"
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
dependencies:
|
||||
- name: "metadata/grep_annotation_column"
|
||||
repository:
|
||||
type: "local"
|
||||
- name: "qc/calculate_qc_metrics"
|
||||
repository:
|
||||
type: "local"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
build_info:
|
||||
config: "src/workflows/qc/qc/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "native"
|
||||
output: "target/nextflow/workflows/qc/qc"
|
||||
executable: "target/nextflow/workflows/qc/qc/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "2.1.1-2-ga0c95224865"
|
||||
dependencies:
|
||||
- "target/nextflow/metadata/grep_annotation_column"
|
||||
- "target/nextflow/qc/calculate_qc_metrics"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"2.1.2\""
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'workflows/qc/qc'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = '2.1.2'
|
||||
description = 'A pipeline to add basic qc statistics to a MuData '
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Cap a requested memory amount at the configured `process.maxMemory`.
 *
 * Used by the `lowmem` / `midmem` / `highmem` / `veryhighmem` labels, which
 * pass in a request that grows with `task.attempt`.
 *
 * @param to_compare the requested memory; it is compared against
 *                   `process.maxMemory` via `compareTo`.
 * @return `to_compare` when no `process.maxMemory` is set or the request does
 *         not exceed it; otherwise `process.maxMemory`. When `task.attempt`
 *         equals `process.maxRetries`, `process.maxMemory` is returned
 *         regardless of the request.
 *
 * If anything in the comparison throws (e.g. an unparsable maxMemory value),
 * an error message is printed and the JVM exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured (missing or null/empty): pass the request through.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, ask for the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // Request is larger than the ceiling: clamp to the ceiling.
    // compareTo only guarantees the sign of its result, hence `> 0`.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined name `max_memory`, which threw and
      // ended up in the catch block below instead of clamping.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,33 @@
|
||||
# Inputs
|
||||
id: # please fill in - example: "foo"
|
||||
input: # please fill in - example: "input.h5mu"
|
||||
modality: "rna"
|
||||
# layer: "raw_counts"
|
||||
|
||||
# Mitochondrial & Ribosomal Gene Detection
|
||||
# var_gene_names: "gene_symbol"
|
||||
# var_name_mitochondrial_genes: "foo"
|
||||
# obs_name_mitochondrial_fraction: "foo"
|
||||
mitochondrial_gene_regex: "^[mM][tT]-"
|
||||
# var_name_ribosomal_genes: "foo"
|
||||
# obs_name_ribosomal_fraction: "foo"
|
||||
ribosomal_gene_regex: "^[Mm]?[Rr][Pp][LlSs]"
|
||||
|
||||
# QC metrics calculation options
|
||||
# var_qc_metrics: ["ercc,highly_variable"]
|
||||
top_n_vars: [50, 100, 200, 500]
|
||||
output_obs_num_nonzero_vars: "num_nonzero_vars"
|
||||
output_obs_total_counts_vars: "total_counts"
|
||||
output_var_num_nonzero_obs: "num_nonzero_obs"
|
||||
output_var_total_counts_obs: "total_counts"
|
||||
output_var_obs_mean: "obs_mean"
|
||||
output_var_pct_dropout: "pct_dropout"
|
||||
|
||||
# Outputs
|
||||
# output: "$id.$key.output.h5mu"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
|
||||
# Arguments
|
||||
@@ -0,0 +1,320 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema",
|
||||
"title": "qc",
|
||||
"description": "A pipeline to add basic qc statistics to a MuData ",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
|
||||
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv",
|
||||
"pattern": "^\\S+\\.csv$"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
|
||||
"inputs" : {
|
||||
"title": "Inputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"id": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `foo`. ID of the sample",
|
||||
"help_text": "Type: `string`, required, example: `foo`. ID of the sample."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"input": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, example: `input.h5mu`. Path to the sample",
|
||||
"help_text": "Type: `file`, required, example: `input.h5mu`. Path to the sample."
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"modality": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `rna`. Which modality to process",
|
||||
"help_text": "Type: `string`, default: `rna`. Which modality to process."
|
||||
,
|
||||
"default":"rna"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"layer": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, example: `raw_counts`. Layer to calculate qc metrics for",
|
||||
"help_text": "Type: `string`, example: `raw_counts`. Layer to calculate qc metrics for."
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"outputs" : {
|
||||
"title": "Outputs",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"output": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Destination path to the output",
|
||||
"help_text": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Destination path to the output."
|
||||
,
|
||||
"default":"$id.$key.output.h5mu"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"mitochondrial & ribosomal gene detection" : {
|
||||
"title": "Mitochondrial & Ribosomal Gene Detection",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"var_gene_names": {
|
||||
"type":
|
||||
"string",
|
||||
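"description": "Type: `string`, example: `gene_symbol`. .var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set)",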
|
||||
"help_text": "Type: `string`, example: `gene_symbol`. .var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"var_name_mitochondrial_genes": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. In which .var slot to store a boolean array corresponding to the mitochondrial genes",
|
||||
"help_text": "Type: `string`. In which .var slot to store a boolean array corresponding the mitochondrial genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_name_mitochondrial_fraction": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. .obs slot to store the fraction of reads found to be mitochondrial",
|
||||
"help_text": "Type: `string`. .Obs slot to store the fraction of reads found to be mitochondrial. Defaults to \u0027fraction_\u0027 suffixed by the value of --var_name_mitochondrial_genes\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"mitochondrial_gene_regex": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `^[mM][tT]-`. Regex string that identifies mitochondrial genes from --var_gene_names",
|
||||
"help_text": "Type: `string`, default: `^[mM][tT]-`. Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n"
|
||||
,
|
||||
"default":"^[mM][tT]-"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"var_name_ribosomal_genes": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. In which .var slot to store a boolean array corresponding to the ribosomal genes",
|
||||
"help_text": "Type: `string`. In which .var slot to store a boolean array corresponding the ribosomal genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"obs_name_ribosomal_fraction": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`. When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name",
|
||||
"help_text": "Type: `string`. When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"ribosomal_gene_regex": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `^[Mm]?[Rr][Pp][LlSs]`. Regex string that identifies ribosomal genes from --var_gene_names",
|
||||
"help_text": "Type: `string`, default: `^[Mm]?[Rr][Pp][LlSs]`. Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n"
|
||||
,
|
||||
"default":"^[Mm]?[Rr][Pp][LlSs]"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"qc metrics calculation options" : {
|
||||
"title": "QC metrics calculation options",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
|
||||
|
||||
"var_qc_metrics": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `string`, example: `ercc,highly_variable`, multiple_sep: `\",\"`. Keys to select a boolean (containing only True or False) column from .var",
|
||||
"help_text": "Type: List of `string`, example: `ercc,highly_variable`, multiple_sep: `\",\"`. Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled \u0027True\u0027, \ncompared to the total sum of the values for all genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n"
|
||||
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"top_n_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: List of `integer`, default: `50,100,200,500`, multiple_sep: `\",\"`. Number of top vars to be used to calculate cumulative proportions",
|
||||
"help_text": "Type: List of `integer`, default: `50,100,200,500`, multiple_sep: `\",\"`. Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20,50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n"
|
||||
,
|
||||
"default":"50,100,200,500"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_obs_num_nonzero_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes)",
|
||||
"help_text": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n"
|
||||
,
|
||||
"default":"num_nonzero_vars"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_obs_total_counts_vars": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns",
|
||||
"help_text": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n"
|
||||
,
|
||||
"default":"total_counts"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_num_nonzero_obs": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)",
|
||||
"help_text": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n"
|
||||
,
|
||||
"default":"num_nonzero_obs"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_total_counts_obs": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows",
|
||||
"help_text": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n"
|
||||
,
|
||||
"default":"total_counts"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_obs_mean": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, default: `obs_mean`. Name of the column in .obs providing the mean of the values in each row",
|
||||
"help_text": "Type: `string`, default: `obs_mean`. Name of the column in .obs providing the mean of the values in each row.\n"
|
||||
,
|
||||
"default":"obs_mean"
|
||||
}
|
||||
|
||||
|
||||
,
|
||||
"output_var_pct_dropout": {
|
||||
"type":
|
||||
"string",
|
||||
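"description": "Type: `string`, default: `pct_dropout`. Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing)",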
|
||||
"help_text": "Type: `string`, default: `pct_dropout`. Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n"
|
||||
,
|
||||
"default":"pct_dropout"
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
"nextflow input-output arguments" : {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
|
||||
|
||||
"publish_dir": {
|
||||
"type":
|
||||
"string",
|
||||
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
|
||||
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/inputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/outputs"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/mitochondrial & ribosomal gene detection"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/qc metrics calculation options"
|
||||
},
|
||||
|
||||
{
|
||||
"$ref": "#/definitions/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
process.errorStrategy = 'ignore'
|
||||
@@ -0,0 +1,36 @@
|
||||
profiles {
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Cap a requested memory amount at the configured `process.maxMemory`.
 *
 * Used by the `lowmem` / `midmem` / `highmem` / `veryhighmem` labels, which
 * pass in a request that grows with `task.attempt`.
 *
 * @param to_compare the requested memory; it is compared against
 *                   `process.maxMemory` via `compareTo`.
 * @return `to_compare` when no `process.maxMemory` is set or the request does
 *         not exceed it; otherwise `process.maxMemory`. When `task.attempt`
 *         equals `process.maxRetries`, `process.maxMemory` is returned
 *         regardless of the request.
 *
 * If anything in the comparison throws (e.g. an unparsable maxMemory value),
 * an error message is printed and the JVM exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured (missing or null/empty): pass the request through.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, ask for the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // Request is larger than the ceiling: clamp to the ceiling.
    // compareTo only guarantees the sign of its result, hence `> 0`.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Previously returned the undefined name `max_memory`, which threw and
      // ended up in the catch block below instead of clamping.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,33 @@
|
||||
process {
|
||||
withLabel: lowmem { memory = 13.Gb }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midmem { memory = 13.Gb }
|
||||
withLabel: midcpu { cpus = 4 }
|
||||
withLabel: highmem { memory = 13.Gb }
|
||||
withLabel: highcpu { cpus = 4 }
|
||||
withLabel: veryhighmem { memory = 13.Gb }
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
}
|
||||
|
||||
env.NUMBA_CACHE_DIR = '/tmp'
|
||||
|
||||
trace {
|
||||
enabled = true
|
||||
overwrite = true
|
||||
}
|
||||
dag {
|
||||
overwrite = true
|
||||
}
|
||||
|
||||
process.maxForks = 1
|
||||
@@ -0,0 +1,224 @@
|
||||
name: "split_modalities"
|
||||
namespace: "workflows/multiomics"
|
||||
version: "disable-scrublet_build"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "author"
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Inputs"
|
||||
arguments:
|
||||
- type: "string"
|
||||
name: "--id"
|
||||
description: "ID of the sample."
|
||||
info: null
|
||||
example:
|
||||
- "foo"
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Path to the sample."
|
||||
info: null
|
||||
example:
|
||||
- "input.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- name: "Outputs"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
description: "Output directory containing multiple h5mu files."
|
||||
info: null
|
||||
example:
|
||||
- "/path/to/output"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output_types"
|
||||
description: "A csv containing the base filename and modality type per output\
|
||||
\ file."
|
||||
info: null
|
||||
example:
|
||||
- "types.csv"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "nextflow_script"
|
||||
path: "main.nf"
|
||||
is_executable: true
|
||||
entrypoint: "run_wf"
|
||||
- type: "file"
|
||||
path: "utils"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "A pipeline to split a multimodal mudata files into several unimodal\
|
||||
\ mudata files."
|
||||
test_resources:
|
||||
- type: "nextflow_script"
|
||||
path: "test.nf"
|
||||
is_executable: true
|
||||
entrypoint: "test_wf"
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
|
||||
info:
|
||||
test_dependencies:
|
||||
- name: "split_modalities_test"
|
||||
namespace: "test_workflows/multiomics"
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "private"
|
||||
target: "private"
|
||||
dependencies:
|
||||
- name: "dataflow/split_modalities"
|
||||
alias: "split_modalities_component"
|
||||
repository:
|
||||
type: "local"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
build_info:
|
||||
config: "src/workflows/multiomics/split_modalities/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "native"
|
||||
output: "target/_private/nextflow/workflows/multiomics/split_modalities"
|
||||
executable: "target/_private/nextflow/workflows/multiomics/split_modalities/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "07297b53180b28c8e198414328683e941eec7ed0"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "0.2.0-2044-g07297b53180"
|
||||
dependencies:
|
||||
- "target/nextflow/dataflow/split_modalities"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"disable-scrublet_build\""
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'workflows/multiomics/split_modalities'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = 'disable-scrublet_build'
|
||||
description = 'A pipeline to split a multimodal mudata files into several unimodal mudata files.'
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
// Clamp a requested memory amount to the optional `process.maxMemory` limit.
//
// to_compare: the memory amount requested by a label closure
//             (e.g. `4.GB * task.attempt`).
// Returns `to_compare` unchanged when `process.maxMemory` is unset/falsy or
// when the request does not exceed it; otherwise returns `process.maxMemory`.
// If the configured values cannot be interpreted, an error is printed and the
// JVM exits with status 1.
def get_memory(to_compare) {
  // No upper bound configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, request the full limit.
    // NOTE(review): `task` is not a parameter of this function; confirm that it
    // resolves to the running task when called from a directive closure.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only guarantees the sign of its result, so test `> 0`.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request exceeds the limit: return the configured limit itself
      // (previously this referenced an undefined `max_memory` variable).
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1 @@
|
||||
process.errorStrategy = 'ignore'
|
||||
@@ -0,0 +1,36 @@
|
||||
profiles {
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
// Clamp a requested memory amount to the optional `process.maxMemory` limit.
//
// to_compare: the memory amount requested by a label closure
//             (e.g. `4.GB * task.attempt`).
// Returns `to_compare` unchanged when `process.maxMemory` is unset/falsy or
// when the request does not exceed it; otherwise returns `process.maxMemory`.
// If the configured values cannot be interpreted, an error is printed and the
// JVM exits with status 1.
def get_memory(to_compare) {
  // No upper bound configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, request the full limit.
    // NOTE(review): `task` is not a parameter of this function; confirm that it
    // resolves to the running task when called from a directive closure.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only guarantees the sign of its result, so test `> 0`.
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request exceeds the limit: return the configured limit itself
      // (previously this referenced an undefined `max_memory` variable).
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,33 @@
|
||||
process {
|
||||
withLabel: lowmem { memory = 13.Gb }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midmem { memory = 13.Gb }
|
||||
withLabel: midcpu { cpus = 4 }
|
||||
withLabel: highmem { memory = 13.Gb }
|
||||
withLabel: highcpu { cpus = 4 }
|
||||
withLabel: veryhighmem { memory = 13.Gb }
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
}
|
||||
|
||||
env.NUMBA_CACHE_DIR = '/tmp'
|
||||
|
||||
trace {
|
||||
enabled = true
|
||||
overwrite = true
|
||||
}
|
||||
dag {
|
||||
overwrite = true
|
||||
}
|
||||
|
||||
process.maxForks = 1
|
||||
@@ -0,0 +1,296 @@
|
||||
name: "leiden"
|
||||
namespace: "cluster"
|
||||
version: "disable-scrublet_build"
|
||||
authors:
|
||||
- name: "Dries De Maeyer"
|
||||
roles:
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "ddemaeyer@gmail.com"
|
||||
github: "ddemaeyer"
|
||||
linkedin: "dries-de-maeyer-b46a814"
|
||||
organizations:
|
||||
- name: "Janssen Pharmaceuticals"
|
||||
href: "https://www.janssen.com"
|
||||
role: "Principal Scientist"
|
||||
argument_groups:
|
||||
- name: "Arguments"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Input file."
|
||||
info: null
|
||||
example:
|
||||
- "input.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "Which modality from the input MuData file to process.\n"
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obsp_connectivities"
|
||||
description: "In which .obsp slot the neighbor connectivities can be found."
|
||||
info: null
|
||||
default:
|
||||
- "connectivities"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
description: "Output file."
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obsm_name"
|
||||
description: "Name of the .obsm key under which to add the cluster labels.\nThe\
|
||||
\ name of the columns in the matrix will correspond to the resolutions.\n"
|
||||
info: null
|
||||
default:
|
||||
- "leiden"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "double"
|
||||
name: "--resolution"
|
||||
description: "A parameter value controlling the coarseness of the clustering.\
|
||||
\ Higher values lead to more clusters.\nMultiple values will result in clustering\
|
||||
\ being performed multiple times.\n"
|
||||
info: null
|
||||
default:
|
||||
- 1.0
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
|
||||
By default no compression is applied.\n"
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "compress_h5mu.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Cluster cells using the [Leiden algorithm] [Traag18] implemented in\
|
||||
\ the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain\
|
||||
\ algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15]\
|
||||
\ [Levine15]. \nThis requires having run `neighbors/find_neighbors` or `neighbors/bbknn`\
|
||||
\ first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in\
|
||||
\ large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven\
|
||||
\ Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with\
|
||||
\ Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing\
|
||||
\ well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale\
|
||||
\ single-cell gene expression data analysis, Genome Biology. \n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "highcpu"
|
||||
- "midmem"
|
||||
- "middisk"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.13-slim"
|
||||
target_tag: "disable-scrublet_build"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
- "scanpy~=1.10.4"
|
||||
- "leidenalg~=0.10.0"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/cluster/leiden/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/cluster/leiden"
|
||||
executable: "target/nextflow/cluster/leiden/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "07297b53180b28c8e198414328683e941eec7ed0"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "0.2.0-2044-g07297b53180"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"disable-scrublet_build\""
|
||||
- ".engines[.type == 'docker'].target_tag := 'disable-scrublet_build'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
@@ -0,0 +1,87 @@
|
||||
import shutil
|
||||
from anndata import AnnData
|
||||
from mudata import write_h5ad
|
||||
from h5py import File as H5File
|
||||
from h5py import Group, Dataset
|
||||
from pathlib import Path
|
||||
from typing import Union, Literal
|
||||
from functools import partial
|
||||
|
||||
|
||||
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a copy of the HDF5 file at ``input_path`` with compressed datasets.

    Every group and dataset of the input is recreated in ``output_path``
    together with its attributes. Non-scalar datasets that are not yet
    compressed are rewritten with ``compression``; all other datasets are
    copied as they are. Finally the textual header found at the start of the
    input file is copied into the 512-byte user block of the output file.

    Args:
        input_path: Path of the file to read.
        output_path: Path of the file to create (opened in ``"w"`` mode).
        compression: Compression filter passed to ``create_dataset``.

    Raises:
        NotImplementedError: If the input contains an object that is neither
            a ``Group`` nor a ``Dataset``.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the user block reserved in front of the HDF5 data of the output.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        """Copy all HDF5 attributes from ``in_object`` onto ``out_object``."""
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        """Recreate one visited group or dataset in ``output_h5``."""
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects don't have a shape defined
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                # Already compressed, or scalar: copy without changes.
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        userblock = input_bytes.read(userblock_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # partition() keeps the whole block when no null byte is present, whereas
    # slicing with the -1 returned by find() would drop the last byte.
    starting_metadata = userblock.partition(b"\x00")[0]
    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
|
||||
|
||||
|
||||
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file and write one modality into the copy.

    The file ``h5mu`` is copied, then ``modality_data`` is written into the
    copy under ``modality_name`` using ``write_h5ad``. When
    ``output_compression`` is set, this happens in a temporary
    ``<stem>_uncompressed.h5mu`` file next to ``output_file``, which is then
    compressed into ``output_file`` with ``compress_h5mu`` and removed.

    Args:
        output_file: Path of the resulting file.
        h5mu: Path of the existing file used as the starting point.
        modality_name: Name of the modality to write.
        modality_data: Data to store for that modality.
        output_compression: Compression passed to ``compress_h5mu``; a falsy
            value writes directly to ``output_file`` without compression.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)
    output_file_uncompressed = (
        output_file.with_name(output_file.stem + "_uncompressed.h5mu")
        if output_compression
        else output_file
    )
    shutil.copyfile(h5mu, output_file_uncompressed)
    try:
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        if output_compression:
            compress_h5mu(
                output_file_uncompressed, output_file, compression=output_compression
            )
    finally:
        # Remove the temporary file even when writing or compressing failed,
        # so that a failed run does not leave the intermediate file behind.
        # Without compression the "uncompressed" path IS the output: keep it.
        if output_compression:
            output_file_uncompressed.unlink(missing_ok=True)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'cluster/leiden'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = 'disable-scrublet_build'
|
||||
description = 'Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15] [Levine15]. \nThis requires having ran `neighbors/find_neighbors` or `neighbors/bbknn` first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale single-cell gene expression data analysis, Genome Biology. \n'
|
||||
author = 'Dries De Maeyer'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Cap a requested memory amount at the optional `process.maxMemory` ceiling.
 *
 * Behaviour:
 *  - no `process.maxMemory` configured (missing or falsy): the request is
 *    returned unchanged;
 *  - the current attempt number equals `process.maxRetries`: the ceiling
 *    itself is returned, whatever was requested;
 *  - the request is larger than the ceiling: the ceiling is returned;
 *  - otherwise the request is returned unchanged.
 *
 * Any exception raised while evaluating the above (e.g. a value that cannot
 * be coerced to a MemoryUnit) prints a diagnostic and terminates the JVM
 * with exit status 1.
 *
 * NOTE(review): `task` is not a parameter of this function; it is assumed to
 * resolve from the scope this function is called in -- confirm.
 *
 * @param to_compare requested memory; must support `compareTo` against a
 *                   `nextflow.util.MemoryUnit`
 * @return the memory amount to request for the task
 */
def get_memory(to_compare) {
  // Without a configured ceiling there is nothing to cap against.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      // Attempt number matches maxRetries: go straight to the ceiling.
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request exceeds the ceiling: clamp it. This branch previously
      // returned the undefined variable `max_memory`, which threw and sent
      // every over-limit request into the catch block below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,12 @@
|
||||
# Arguments
|
||||
input: # please fill in - example: "input.h5mu"
|
||||
modality: "rna"
|
||||
obsp_connectivities: "connectivities"
|
||||
# output: "$id.$key.output.h5mu"
|
||||
obsm_name: "leiden"
|
||||
resolution: # please fill in - example: [1.0]
|
||||
# output_compression: "gzip"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
@@ -0,0 +1,101 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "leiden",
|
||||
"description": "Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy framework] [Wolf18]. \nLeiden is an improved version of the [Louvain algorithm] [Blondel08]. \nIt has been proposed for single-cell analysis by [Levine15] [Levine15]. \nThis requires having run `neighbors/find_neighbors` or `neighbors/bbknn` first.\n\n[Blondel08]: Blondel et al. (2008), Fast unfolding of communities in large networks, J. Stat. Mech. \n[Levine15]: Levine et al. (2015), Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis, Cell. \n[Traag18]: Traag et al. (2018), From Louvain to Leiden: guaranteeing well-connected communities arXiv. \n[Wolf18]: Wolf et al. (2018), Scanpy: large-scale single-cell gene expression data analysis, Genome Biology. \n",
|
||||
"type": "object",
|
||||
"$defs": {
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv"
|
||||
}
|
||||
}
|
||||
},
|
||||
"arguments": {
|
||||
"title": "Arguments",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
"input": {
|
||||
"type": "string",
|
||||
"format": "path",
|
||||
"exists": true,
|
||||
"description": "Input file.",
|
||||
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. "
|
||||
},
|
||||
"modality": {
|
||||
"type": "string",
|
||||
"description": "Which modality from the input MuData file to process.\n",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ",
|
||||
"default": "rna"
|
||||
},
|
||||
"obsp_connectivities": {
|
||||
"type": "string",
|
||||
"description": "In which .obsp slot the neighbor connectivities can be found.",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"connectivities\"`. ",
|
||||
"default": "connectivities"
|
||||
},
|
||||
"output": {
|
||||
"type": "string",
|
||||
"format": "path",
|
||||
"description": "Output file.",
|
||||
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ",
|
||||
"default": "$id.$key.output.h5mu"
|
||||
},
|
||||
"obsm_name": {
|
||||
"type": "string",
|
||||
"description": "Name of the .obsm key under which to add the cluster labels.\nThe name of the columns in the matrix will correspond to the resolutions.\n",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"leiden\"`. ",
|
||||
"default": "leiden"
|
||||
},
|
||||
"resolution": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"description": "A parameter value controlling the coarseness of the clustering",
|
||||
"help_text": "Type: `double`, multiple: `True`, required, default: `[1.0]`. ",
|
||||
"default": [
|
||||
1.0
|
||||
]
|
||||
},
|
||||
"output_compression": {
|
||||
"type": "string",
|
||||
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
|
||||
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ",
|
||||
"enum": [
|
||||
"gzip",
|
||||
"lzf"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"nextflow input-output arguments": {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
"publish_dir": {
|
||||
"type": "string",
|
||||
"description": "Path to an output directory.",
|
||||
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
{
|
||||
"$ref": "#/$defs/arguments"
|
||||
},
|
||||
{
|
||||
"$ref": "#/$defs/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to report INFO-level records on stdout.

    The root logger's level is set to ``INFO`` and a ``StreamHandler``
    writing to ``sys.stdout`` is attached, formatted as
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: if the root logger already has a stream
    handler bound to the current ``sys.stdout``, no further handler is added.
    Without this guard every extra call would attach one more handler and
    each record would be printed once per call.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Look for a handler that already writes to this exact stdout object.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,328 @@
|
||||
name: "concatenate_h5mu"
|
||||
namespace: "dataflow"
|
||||
version: "disable-scrublet_build"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Arguments"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Paths to the different samples to be concatenated."
|
||||
info: null
|
||||
example:
|
||||
- "sample_paths"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "Only output concatenated objects for the provided modalities. Outputs\
|
||||
\ all modalities by default."
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--input_id"
|
||||
description: "Names of the different samples that have to be concatenated. Must\
|
||||
\ be specified when using '--other_axis_mode move'.\nIn this case, the ids will be used\
|
||||
\ for the column names of the dataframes registering the conflicts.\nIf specified,\
|
||||
\ must be of same length as `--input`.\n"
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
description: "Output location for the concatenated MuData object file.\n"
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: false
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obs_sample_name"
|
||||
description: "Name of the .obs key under which to add the sample names."
|
||||
info: null
|
||||
default:
|
||||
- "sample_id"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--other_axis_mode"
|
||||
description: "How to handle the merging of other axis (var, obs, ...).\n\n -\
|
||||
\ None: keep no data\n - same: only keep elements of the matrices which are\
|
||||
\ the same in each of the samples\n - unique: only keep elements for which\
|
||||
\ there is only 1 possible value (1 value that can occur in multiple samples)\n\
|
||||
\ - first: keep the annotation from the first sample\n - only: keep elements\
|
||||
\ that show up in only one of the objects (1 unique element in only 1 sample)\n\
|
||||
\ - move: identical to 'same', but moving the conflicting values to .varm or\
|
||||
\ .obsm\n"
|
||||
info: null
|
||||
default:
|
||||
- "move"
|
||||
required: false
|
||||
choices:
|
||||
- "same"
|
||||
- "unique"
|
||||
- "first"
|
||||
- "only"
|
||||
- "concat"
|
||||
- "move"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--uns_merge_mode"
|
||||
description: "How to handle the merging of .uns across modalities\n - None: keep\
|
||||
\ no data\n - same: only keep elements of the matrices which are the same in\
|
||||
\ each of the samples\n - unique: only keep elements for which there is only\
|
||||
\ 1 possible value (1 value that can occur in multiple samples)\n - first:\
|
||||
\ keep the annotation from the first sample\n - only: keep elements that show\
|
||||
\ up in only one of the objects (1 unique element in only 1 sample)\n - make_unique:\
|
||||
\ identical to 'unique', but keys which are not unique are made unique by prefixing\
|
||||
\ them with the sample id.\n"
|
||||
info: null
|
||||
default:
|
||||
- "make_unique"
|
||||
required: false
|
||||
choices:
|
||||
- "same"
|
||||
- "unique"
|
||||
- "first"
|
||||
- "only"
|
||||
- "make_unique"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
|
||||
By default no compression is applied.\n"
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "compress_h5mu.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Concatenate observations from samples in several (uni- and/or multi-modal)\
|
||||
\ MuData files into a single file.\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
|
||||
- type: "file"
|
||||
path: "human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "midcpu"
|
||||
- "highmem"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.11-slim"
|
||||
target_tag: "disable-scrublet_build"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
- "pandas~=2.1.1"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/dataflow/concatenate_h5mu/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/dataflow/concatenate_h5mu"
|
||||
executable: "target/nextflow/dataflow/concatenate_h5mu/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "07297b53180b28c8e198414328683e941eec7ed0"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "0.2.0-2044-g07297b53180"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"disable-scrublet_build\""
|
||||
- ".engines[.type == 'docker'].target_tag := 'disable-scrublet_build'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
@@ -0,0 +1,87 @@
|
||||
import shutil
|
||||
from anndata import AnnData
|
||||
from mudata import write_h5ad
|
||||
from h5py import File as H5File
|
||||
from h5py import Group, Dataset
|
||||
from pathlib import Path
|
||||
from typing import Union, Literal
|
||||
from functools import partial
|
||||
|
||||
|
||||
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a compressed copy of the HDF5 file at ``input_path``.

    Every group and dataset of the input is visited. Groups are re-created
    with their attributes. Datasets that are not yet compressed and are not
    scalar are re-created with ``compression`` applied; all other datasets
    are copied unchanged. Finally the text header stored at the start of the
    input file is copied into the 512-byte userblock of the output file.

    Args:
        input_path: Path of the file to read.
        output_path: Path of the file to create (opened in ``"w"`` mode).
        compression: Compression filter handed to ``create_dataset``.

    Raises:
        NotImplementedError: If a visited object is neither a ``Group`` nor
            a ``Dataset``.
    """
    # Size of the userblock reserved at the start of the output file.
    userblock_size = 512
    input_path, output_path = str(input_path), str(output_path)

    def copy_attributes(in_object, out_object):
        # Mirror all HDF5 attributes from one object onto another.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        # Callback for ``visititems``: replicate one input object in the output.
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects don't have a shape defined
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole userblock rather than a fixed 100-byte prefix so a
        # longer header is not cut short.
        starting_metadata = input_bytes.read(userblock_size)
    # The metadata is padded with extra null bytes up until 512 bytes.
    # ``partition`` keeps the full buffer when no null byte is present; the
    # previous ``find`` + slice dropped the last byte in that case.
    starting_metadata = starting_metadata.partition(b"\x00")[0]
    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
|
||||
|
||||
|
||||
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an h5mu file, write one modality into the copy, optionally compress.

    Without ``output_compression`` the input file is copied straight to
    ``output_file`` and ``modality_data`` is written into that copy with
    ``write_h5ad`` under ``modality_name``.

    With ``output_compression`` the same steps run on a staging file named
    ``<output stem>_uncompressed.h5mu`` next to ``output_file``. The staging
    file is then passed through ``compress_h5mu`` to produce ``output_file``
    and removed afterwards. The removal runs in a ``finally`` clause, so a
    failure while writing or compressing no longer leaves the staging file
    behind.

    Args:
        output_file: Destination path of the resulting file.
        h5mu: Path of the existing file used as the starting point.
        modality_name: Passed to ``write_h5ad`` as ``mod``.
        modality_data: Passed to ``write_h5ad`` as ``data``.
        output_compression: Compression passed to ``compress_h5mu``; any
            falsy value disables compression.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        # No compression requested: work on the final file directly.
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    try:
        shutil.copyfile(h5mu, output_file_uncompressed)
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Always remove the staging file, including on failure.
        output_file_uncompressed.unlink(missing_ok=True)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'dataflow/concatenate_h5mu'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = 'disable-scrublet_build'
|
||||
description = 'Concatenate observations from samples in several (uni- and/or multi-modal) MuData files into a single file.\n'
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Cap a requested memory amount at the optional `process.maxMemory` ceiling.
 *
 * Behaviour:
 *  - no `process.maxMemory` configured (missing or falsy): the request is
 *    returned unchanged;
 *  - the current attempt number equals `process.maxRetries`: the ceiling
 *    itself is returned, whatever was requested;
 *  - the request is larger than the ceiling: the ceiling is returned;
 *  - otherwise the request is returned unchanged.
 *
 * Any exception raised while evaluating the above (e.g. a value that cannot
 * be coerced to a MemoryUnit) prints a diagnostic and terminates the JVM
 * with exit status 1.
 *
 * NOTE(review): `task` is not a parameter of this function; it is assumed to
 * resolve from the scope this function is called in -- confirm.
 *
 * @param to_compare requested memory; must support `compareTo` against a
 *                   `nextflow.util.MemoryUnit`
 * @return the memory amount to request for the task
 */
def get_memory(to_compare) {
  // Without a configured ceiling there is nothing to cap against.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      // Attempt number matches maxRetries: go straight to the ceiling.
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request exceeds the ceiling: clamp it. This branch previously
      // returned the undefined variable `max_memory`, which threw and sent
      // every over-limit request into the catch block below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,13 @@
|
||||
# Arguments
|
||||
input: # please fill in - example: ["sample_paths"]
|
||||
# modality: ["foo"]
|
||||
# input_id: ["foo"]
|
||||
# output: "$id.$key.output.h5mu"
|
||||
obs_sample_name: "sample_id"
|
||||
other_axis_mode: "move"
|
||||
uns_merge_mode: "make_unique"
|
||||
# output_compression: "gzip"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
@@ -0,0 +1,124 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "concatenate_h5mu",
|
||||
"description": "Concatenate observations from samples in several (uni- and/or multi-modal) MuData files into a single file.\n",
|
||||
"type": "object",
|
||||
"$defs": {
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv"
|
||||
}
|
||||
}
|
||||
},
|
||||
"arguments": {
|
||||
"title": "Arguments",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
"input": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"format": "path",
|
||||
"exists": true,
|
||||
"description": "Paths to the different samples to be concatenated.",
|
||||
"help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"sample_paths\"]`. "
|
||||
},
|
||||
"modality": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only output concatenated objects for the provided modalities",
|
||||
"help_text": "Type: `string`, multiple: `True`. "
|
||||
},
|
||||
"input_id": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Names of the different samples that have to be concatenated",
|
||||
"help_text": "Type: `string`, multiple: `True`. "
|
||||
},
|
||||
"output": {
|
||||
"type": "string",
|
||||
"format": "path",
|
||||
"description": "Output location for the concatenated MuData object file.\n",
|
||||
"help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ",
|
||||
"default": "$id.$key.output.h5mu"
|
||||
},
|
||||
"obs_sample_name": {
|
||||
"type": "string",
|
||||
"description": "Name of the .obs key under which to add the sample names.",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"sample_id\"`. ",
|
||||
"default": "sample_id"
|
||||
},
|
||||
"other_axis_mode": {
|
||||
"type": "string",
|
||||
"description": "How to handle the merging of other axis (var, obs, ...).\n\n - None: keep no data\n - same: only keep elements of the matrices which are the same in each of the samples\n - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples)\n - first: keep the annotation from the first sample\n - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample)\n - move: identical to 'same', but moving the conflicting values to .varm or .obsm\n",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"move\"`, choices: ``same`, `unique`, `first`, `only`, `concat`, `move``. ",
|
||||
"enum": [
|
||||
"same",
|
||||
"unique",
|
||||
"first",
|
||||
"only",
|
||||
"concat",
|
||||
"move"
|
||||
],
|
||||
"default": "move"
|
||||
},
|
||||
"uns_merge_mode": {
|
||||
"type": "string",
|
||||
"description": "How to handle the merging of .uns across modalities\n - None: keep no data\n - same: only keep elements of the matrices which are the same in each of the samples\n - unique: only keep elements for which there is only 1 possible value (1 value that can occur in multiple samples)\n - first: keep the annotation from the first sample\n - only: keep elements that show up in only one of the objects (1 unique element in only 1 sample)\n - make_unique: identical to 'unique', but keys which are not unique are made unique by prefixing them with the sample id.\n",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"make_unique\"`, choices: ``same`, `unique`, `first`, `only`, `make_unique``. ",
|
||||
"enum": [
|
||||
"same",
|
||||
"unique",
|
||||
"first",
|
||||
"only",
|
||||
"make_unique"
|
||||
],
|
||||
"default": "make_unique"
|
||||
},
|
||||
"output_compression": {
|
||||
"type": "string",
|
||||
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
|
||||
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ",
|
||||
"enum": [
|
||||
"gzip",
|
||||
"lzf"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"nextflow input-output arguments": {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
"publish_dir": {
|
||||
"type": "string",
|
||||
"description": "Path to an output directory.",
|
||||
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
{
|
||||
"$ref": "#/$defs/arguments"
|
||||
},
|
||||
{
|
||||
"$ref": "#/$defs/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to emit INFO-level records on stdout.

    The root logger's level is set to ``logging.INFO`` and a
    ``logging.StreamHandler`` bound to ``sys.stdout`` is attached, using the
    format ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: when the root logger already carries a
    stream handler bound to the current ``sys.stdout``, no additional handler
    is attached, so calling it more than once does not duplicate log lines.

    Returns:
        logging.Logger: The configured root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Handlers accumulate on the root logger for the lifetime of the process;
    # only attach ours when an equivalent stdout handler is not present yet.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        )
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,242 @@
|
||||
name: "merge"
|
||||
namespace: "dataflow"
|
||||
version: "disable-scrublet_build"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
argument_groups:
|
||||
- name: "Arguments"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Paths to the single-modality .h5mu files that need to be combined"
|
||||
info: null
|
||||
default:
|
||||
- "sample_paths"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: true
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
description: "Path to the output file."
|
||||
info: null
|
||||
default:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: false
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "The compression format to be used on the output h5mu object."
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Combine one or more single-modality .h5mu files together into one .h5mu\
|
||||
\ file.\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu"
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "singlecpu"
|
||||
- "highmem"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.12-slim"
|
||||
target_tag: "disable-scrublet_build"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/dataflow/merge/config.vsh.yml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/dataflow/merge"
|
||||
executable: "target/nextflow/dataflow/merge/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "07297b53180b28c8e198414328683e941eec7ed0"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "0.2.0-2044-g07297b53180"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"disable-scrublet_build\""
|
||||
- ".engines[.type == 'docker'].target_tag := 'disable-scrublet_build'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'dataflow/merge'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = 'disable-scrublet_build'
|
||||
description = 'Combine one or more single-modality .h5mu files together into one .h5mu file.\n'
|
||||
author = 'Dries Schaumont'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
// Cap a requested memory amount at the optional `process.maxMemory` ceiling.
//
// to_compare: the memory amount requested by a label; it is compared against
//             `process.maxMemory` converted to a nextflow.util.MemoryUnit.
// returns:    - `to_compare` unchanged when `process.maxMemory` is unset or falsy;
//             - `process.maxMemory` when the current attempt number equals
//               `process.maxRetries`;
//             - the ceiling when `to_compare` exceeds it;
//             - `to_compare` otherwise.
// Any error while evaluating the ceiling prints a diagnostic and exits with status 1.
//
// NOTE(review): `task` is not a parameter of this function; it is presumably
// resolved from the calling directive closure's context - confirm.
def get_memory(to_compare) {
  // No ceiling configured: honour the request as-is.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, request the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }

    def ceiling = process.maxMemory as nextflow.util.MemoryUnit
    // compareTo only guarantees the sign of its result, so test `> 0` rather than `== 1`.
    if (to_compare.compareTo(ceiling) > 0) {
      // Previously returned the undefined name `max_memory`, which threw and
      // fell through to the catch block below, aborting the run.
      return ceiling
    }
    return to_compare
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,8 @@
|
||||
# Arguments
|
||||
input: # please fill in - example: ["sample_paths"]
|
||||
# output: "output.h5mu"
|
||||
# output_compression: "gzip"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
@@ -0,0 +1,78 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "merge",
|
||||
"description": "Combine one or more single-modality .h5mu files together into one .h5mu file.\n",
|
||||
"type": "object",
|
||||
"$defs": {
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv"
|
||||
}
|
||||
}
|
||||
},
|
||||
"arguments": {
|
||||
"title": "Arguments",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
"input": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"format": "path",
|
||||
"exists": true,
|
||||
"description": "Paths to the single-modality .h5mu files that need to be combined",
|
||||
"help_text": "Type: `file`, multiple: `True`, required, default: `[\"sample_paths\"]`, direction: `input`. ",
|
||||
"default": [
|
||||
"sample_paths"
|
||||
]
|
||||
},
|
||||
"output": {
|
||||
"type": "string",
|
||||
"format": "path",
|
||||
"description": "Path to the output file.",
|
||||
"help_text": "Type: `file`, multiple: `False`, default: `\"output.h5mu\"`, direction: `output`. ",
|
||||
"default": "output.h5mu"
|
||||
},
|
||||
"output_compression": {
|
||||
"type": "string",
|
||||
"description": "The compression format to be used on the output h5mu object.",
|
||||
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: ``gzip`, `lzf``. ",
|
||||
"enum": [
|
||||
"gzip",
|
||||
"lzf"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"nextflow input-output arguments": {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
"publish_dir": {
|
||||
"type": "string",
|
||||
"description": "Path to an output directory.",
|
||||
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
{
|
||||
"$ref": "#/$defs/arguments"
|
||||
},
|
||||
{
|
||||
"$ref": "#/$defs/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to emit INFO-level records on stdout.

    The root logger's level is set to ``logging.INFO`` and a
    ``logging.StreamHandler`` bound to ``sys.stdout`` is attached, using the
    format ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: when the root logger already carries a
    stream handler bound to the current ``sys.stdout``, no additional handler
    is attached, so calling it more than once does not duplicate log lines.

    Returns:
        logging.Logger: The configured root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Handlers accumulate on the root logger for the lifetime of the process;
    # only attach ours when an equivalent stdout handler is not present yet.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        )
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,269 @@
|
||||
name: "split_modalities"
|
||||
namespace: "dataflow"
|
||||
version: "disable-scrublet_build"
|
||||
authors:
|
||||
- name: "Dries Schaumont"
|
||||
roles:
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "dries@data-intuitive.com"
|
||||
github: "DriesSchaumont"
|
||||
orcid: "0000-0002-4389-0440"
|
||||
linkedin: "dries-schaumont"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Scientist"
|
||||
- name: "Robrecht Cannoodt"
|
||||
roles:
|
||||
- "contributor"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "robrecht@data-intuitive.com"
|
||||
github: "rcannood"
|
||||
orcid: "0000-0003-3641-729X"
|
||||
linkedin: "robrechtcannoodt"
|
||||
organizations:
|
||||
- name: "Data Intuitive"
|
||||
href: "https://www.data-intuitive.com"
|
||||
role: "Data Science Engineer"
|
||||
- name: "Open Problems"
|
||||
href: "https://openproblems.bio"
|
||||
role: "Core Member"
|
||||
argument_groups:
|
||||
- name: "Arguments"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Path to a single .h5mu file."
|
||||
info: null
|
||||
default:
|
||||
- "sample_path"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
description: "Output directory containing multiple h5mu files."
|
||||
info: null
|
||||
example:
|
||||
- "/path/to/output"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output_types"
|
||||
description: "A csv containing the base filename and modality type per output\
|
||||
\ file."
|
||||
info: null
|
||||
example:
|
||||
- "types.csv"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
|
||||
By default no compression is applied.\n"
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Split the modalities from a single .h5mu multimodal sample into separate\
|
||||
\ .h5mu files. \n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "singlecpu"
|
||||
- "lowmem"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.12-slim"
|
||||
target_tag: "disable-scrublet_build"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "git"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
github:
|
||||
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/dataflow/split_modalities/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/dataflow/split_modalities"
|
||||
executable: "target/nextflow/dataflow/split_modalities/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "07297b53180b28c8e198414328683e941eec7ed0"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "0.2.0-2044-g07297b53180"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"disable-scrublet_build\""
|
||||
- ".engines[.type == 'docker'].target_tag := 'disable-scrublet_build'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'dataflow/split_modalities'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = 'disable-scrublet_build'
|
||||
description = 'Split the modalities from a single .h5mu multimodal sample into separate .h5mu files. \n'
|
||||
author = 'Dries Schaumont, Robrecht Cannoodt'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Cap a requested memory amount at the optional `process.maxMemory` ceiling.
 *
 * @param to_compare the memory amount requested for the current attempt;
 *                   it must support `compareTo` against a
 *                   `nextflow.util.MemoryUnit`.
 * @return `to_compare` when no `process.maxMemory` is configured or the request
 *         does not exceed it; `process.maxMemory` when the request exceeds it or
 *         when `task.attempt` equals `process.maxRetries`.
 *
 * If the configured values cannot be processed, an error is printed and the
 * JVM exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, ask for the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only guarantees the sign of its result, so test "> 0" rather than "== 1".
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // This branch previously returned `max_memory`, a name that is not defined
      // in this config, so every over-limit request fell into the catch block.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,9 @@
|
||||
# Arguments
|
||||
input: # please fill in - example: "sample_path"
|
||||
# output: "$id.$key.output"
|
||||
# output_types: "$id.$key.output_types.csv"
|
||||
# output_compression: "gzip"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
@@ -0,0 +1,80 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "split_modalities",
|
||||
"description": "Split the modalities from a single .h5mu multimodal sample into separate .h5mu files. \n",
|
||||
"type": "object",
|
||||
"$defs": {
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv"
|
||||
}
|
||||
}
|
||||
},
|
||||
"arguments": {
|
||||
"title": "Arguments",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
"input": {
|
||||
"type": "string",
|
||||
"format": "path",
|
||||
"exists": true,
|
||||
"description": "Path to a single .h5mu file.",
|
||||
"help_text": "Type: `file`, multiple: `False`, required, default: `\"sample_path\"`, direction: `input`. ",
|
||||
"default": "sample_path"
|
||||
},
|
||||
"output": {
|
||||
"type": "string",
|
||||
"format": "path",
|
||||
"description": "Output directory containing multiple h5mu files.",
|
||||
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`, example: `\"/path/to/output\"`. ",
|
||||
"default": "$id.$key.output"
|
||||
},
|
||||
"output_types": {
|
||||
"type": "string",
|
||||
"format": "path",
|
||||
"description": "A csv containing the base filename and modality type per output file.",
|
||||
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_types.csv\"`, direction: `output`, example: `\"types.csv\"`. ",
|
||||
"default": "$id.$key.output_types.csv"
|
||||
},
|
||||
"output_compression": {
|
||||
"type": "string",
|
||||
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
|
||||
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: `gzip`, `lzf`. ",
|
||||
"enum": [
|
||||
"gzip",
|
||||
"lzf"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"nextflow input-output arguments": {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
"publish_dir": {
|
||||
"type": "string",
|
||||
"description": "Path to an output directory.",
|
||||
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
{
|
||||
"$ref": "#/$defs/arguments"
|
||||
},
|
||||
{
|
||||
"$ref": "#/$defs/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    Records are formatted as ``<time> <level, padded to 8 chars> <message>``.
    The function is safe to call repeatedly: a console handler is attached
    only if this helper has not already attached one for the current
    ``sys.stdout``.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    handler_name = "setup_logger_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Without this guard every call stacks one more handler on the root
    # logger and each record is printed once per call.
    already_attached = any(
        handler.get_name() == handler_name
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.set_name(handler_name)
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        )
        logger.addHandler(console_handler)

    return logger
|
||||
@@ -0,0 +1,314 @@
|
||||
name: "pca"
|
||||
namespace: "dimred"
|
||||
version: "disable-scrublet_build"
|
||||
authors:
|
||||
- name: "Dries De Maeyer"
|
||||
roles:
|
||||
- "maintainer"
|
||||
info:
|
||||
role: "Core Team Member"
|
||||
links:
|
||||
email: "ddemaeyer@gmail.com"
|
||||
github: "ddemaeyer"
|
||||
linkedin: "dries-de-maeyer-b46a814"
|
||||
organizations:
|
||||
- name: "Janssen Pharmaceuticals"
|
||||
href: "https://www.janssen.com"
|
||||
role: "Principal Scientist"
|
||||
argument_groups:
|
||||
- name: "Arguments"
|
||||
arguments:
|
||||
- type: "file"
|
||||
name: "--input"
|
||||
alternatives:
|
||||
- "-i"
|
||||
description: "Input h5mu file"
|
||||
info: null
|
||||
example:
|
||||
- "input.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--modality"
|
||||
description: "Which modality from the input MuData file to process.\n"
|
||||
info: null
|
||||
default:
|
||||
- "rna"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--layer"
|
||||
description: "Use specified layer for expression values instead of the .X object\
|
||||
\ from the modality."
|
||||
info: null
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--var_input"
|
||||
description: "Column name in .var matrix that will be used to select which genes\
|
||||
\ to run the PCA on."
|
||||
info: null
|
||||
example:
|
||||
- "filter_with_hvg"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "file"
|
||||
name: "--output"
|
||||
alternatives:
|
||||
- "-o"
|
||||
description: "Output h5mu file."
|
||||
info: null
|
||||
example:
|
||||
- "output.h5mu"
|
||||
must_exist: true
|
||||
create_parent: true
|
||||
required: true
|
||||
direction: "output"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--obsm_output"
|
||||
description: "In which .obsm slot to store the resulting embedding."
|
||||
info: null
|
||||
default:
|
||||
- "X_pca"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--varm_output"
|
||||
description: "In which .varm slot to store the resulting loadings matrix."
|
||||
info: null
|
||||
default:
|
||||
- "pca_loadings"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "string"
|
||||
name: "--uns_output"
|
||||
description: "In which .uns slot to store the resulting variance objects."
|
||||
info: null
|
||||
default:
|
||||
- "pca_variance"
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "integer"
|
||||
name: "--num_components"
|
||||
description: "Number of principal components to compute. Defaults to 50, or to\
|
||||
\ one less than the smallest dimension of the selected representation."
|
||||
info: null
|
||||
example:
|
||||
- 25
|
||||
required: false
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
- type: "boolean_true"
|
||||
name: "--overwrite"
|
||||
description: "Allow overwriting .obsm, .varm and .uns slots."
|
||||
info: null
|
||||
direction: "input"
|
||||
- type: "string"
|
||||
name: "--output_compression"
|
||||
description: "Compression format to use for the output AnnData and/or Mudata objects.\n\
|
||||
By default no compression is applied.\n"
|
||||
info: null
|
||||
example:
|
||||
- "gzip"
|
||||
required: false
|
||||
choices:
|
||||
- "gzip"
|
||||
- "lzf"
|
||||
direction: "input"
|
||||
multiple: false
|
||||
multiple_sep: ";"
|
||||
resources:
|
||||
- type: "python_script"
|
||||
path: "script.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "setup_logger.py"
|
||||
- type: "file"
|
||||
path: "compress_h5mu.py"
|
||||
- type: "file"
|
||||
path: "nextflow_labels.config"
|
||||
dest: "nextflow_labels.config"
|
||||
description: "Computes PCA coordinates, loadings and variance decomposition. Uses\
|
||||
\ the implementation of scikit-learn [Pedregosa11].\n"
|
||||
test_resources:
|
||||
- type: "python_script"
|
||||
path: "test.py"
|
||||
is_executable: true
|
||||
- type: "file"
|
||||
path: "pbmc_1k_protein_v3"
|
||||
info: null
|
||||
status: "enabled"
|
||||
scope:
|
||||
image: "public"
|
||||
target: "public"
|
||||
license: "MIT"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
runners:
|
||||
- type: "executable"
|
||||
id: "executable"
|
||||
docker_setup_strategy: "ifneedbepullelsecachedbuild"
|
||||
- type: "nextflow"
|
||||
id: "nextflow"
|
||||
directives:
|
||||
label:
|
||||
- "highcpu"
|
||||
- "highmem"
|
||||
- "middisk"
|
||||
tag: "$id"
|
||||
auto:
|
||||
simplifyInput: true
|
||||
simplifyOutput: false
|
||||
transcript: false
|
||||
publish: false
|
||||
config:
|
||||
labels:
|
||||
mem1gb: "memory = 1000000000.B"
|
||||
mem2gb: "memory = 2000000000.B"
|
||||
mem5gb: "memory = 5000000000.B"
|
||||
mem10gb: "memory = 10000000000.B"
|
||||
mem20gb: "memory = 20000000000.B"
|
||||
mem50gb: "memory = 50000000000.B"
|
||||
mem100gb: "memory = 100000000000.B"
|
||||
mem200gb: "memory = 200000000000.B"
|
||||
mem500gb: "memory = 500000000000.B"
|
||||
mem1tb: "memory = 1000000000000.B"
|
||||
mem2tb: "memory = 2000000000000.B"
|
||||
mem5tb: "memory = 5000000000000.B"
|
||||
mem10tb: "memory = 10000000000000.B"
|
||||
mem20tb: "memory = 20000000000000.B"
|
||||
mem50tb: "memory = 50000000000000.B"
|
||||
mem100tb: "memory = 100000000000000.B"
|
||||
mem200tb: "memory = 200000000000000.B"
|
||||
mem500tb: "memory = 500000000000000.B"
|
||||
mem1gib: "memory = 1073741824.B"
|
||||
mem2gib: "memory = 2147483648.B"
|
||||
mem4gib: "memory = 4294967296.B"
|
||||
mem8gib: "memory = 8589934592.B"
|
||||
mem16gib: "memory = 17179869184.B"
|
||||
mem32gib: "memory = 34359738368.B"
|
||||
mem64gib: "memory = 68719476736.B"
|
||||
mem128gib: "memory = 137438953472.B"
|
||||
mem256gib: "memory = 274877906944.B"
|
||||
mem512gib: "memory = 549755813888.B"
|
||||
mem1tib: "memory = 1099511627776.B"
|
||||
mem2tib: "memory = 2199023255552.B"
|
||||
mem4tib: "memory = 4398046511104.B"
|
||||
mem8tib: "memory = 8796093022208.B"
|
||||
mem16tib: "memory = 17592186044416.B"
|
||||
mem32tib: "memory = 35184372088832.B"
|
||||
mem64tib: "memory = 70368744177664.B"
|
||||
mem128tib: "memory = 140737488355328.B"
|
||||
mem256tib: "memory = 281474976710656.B"
|
||||
mem512tib: "memory = 562949953421312.B"
|
||||
cpu1: "cpus = 1"
|
||||
cpu2: "cpus = 2"
|
||||
cpu5: "cpus = 5"
|
||||
cpu10: "cpus = 10"
|
||||
cpu20: "cpus = 20"
|
||||
cpu50: "cpus = 50"
|
||||
cpu100: "cpus = 100"
|
||||
cpu200: "cpus = 200"
|
||||
cpu500: "cpus = 500"
|
||||
cpu1000: "cpus = 1000"
|
||||
script:
|
||||
- "includeConfig(\"nextflow_labels.config\")"
|
||||
debug: false
|
||||
container: "docker"
|
||||
engines:
|
||||
- type: "docker"
|
||||
id: "docker"
|
||||
image: "python:3.12-slim"
|
||||
target_tag: "disable-scrublet_build"
|
||||
namespace_separator: "/"
|
||||
setup:
|
||||
- type: "apt"
|
||||
packages:
|
||||
- "procps"
|
||||
interactive: false
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "anndata~=0.11.1"
|
||||
- "mudata~=0.3.1"
|
||||
- "scanpy~=1.10.4"
|
||||
script:
|
||||
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
|
||||
nelse: exit(1)\")"
|
||||
upgrade: true
|
||||
test_setup:
|
||||
- type: "python"
|
||||
user: false
|
||||
packages:
|
||||
- "viashpy==0.8.0"
|
||||
upgrade: true
|
||||
entrypoint: []
|
||||
cmd: null
|
||||
build_info:
|
||||
config: "src/dimred/pca/config.vsh.yaml"
|
||||
runner: "nextflow"
|
||||
engine: "docker"
|
||||
output: "target/nextflow/dimred/pca"
|
||||
executable: "target/nextflow/dimred/pca/main.nf"
|
||||
viash_version: "0.9.4"
|
||||
git_commit: "07297b53180b28c8e198414328683e941eec7ed0"
|
||||
git_remote: "https://github.com/openpipelines-bio/openpipeline"
|
||||
git_tag: "0.2.0-2044-g07297b53180"
|
||||
package_config:
|
||||
name: "openpipeline"
|
||||
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
|
||||
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
|
||||
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
|
||||
\nIn terms of workflows, the following has been made available, but keep in mind\
|
||||
\ that\nindividual tools and functionality can be executed as standalone components\
|
||||
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
|
||||
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
|
||||
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
|
||||
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
|
||||
\ Clustering, integration and batch correction using single and multimodal methods.\n\
|
||||
\ * Downstream analysis workflows\n"
|
||||
info:
|
||||
test_resources:
|
||||
- type: "s3"
|
||||
path: "s3://openpipelines-data"
|
||||
dest: "resources_test"
|
||||
viash_version: "0.9.4"
|
||||
source: "src"
|
||||
target: "target"
|
||||
config_mods:
|
||||
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
|
||||
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
|
||||
)'"
|
||||
- ".version := \"disable-scrublet_build\""
|
||||
- ".engines[.type == 'docker'].target_tag := 'disable-scrublet_build'"
|
||||
keywords:
|
||||
- "single-cell"
|
||||
- "multimodal"
|
||||
license: "MIT"
|
||||
organization: "openpipelines-bio"
|
||||
links:
|
||||
repository: "https://github.com/openpipelines-bio/openpipeline"
|
||||
docker_registry: "ghcr.io"
|
||||
homepage: "https://openpipelines.bio"
|
||||
documentation: "https://openpipelines.bio/fundamentals"
|
||||
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"
|
||||
@@ -0,0 +1,87 @@
|
||||
import shutil
|
||||
from anndata import AnnData
|
||||
from mudata import write_h5ad
|
||||
from h5py import File as H5File
|
||||
from h5py import Group, Dataset
|
||||
from pathlib import Path
|
||||
from typing import Union, Literal
|
||||
from functools import partial
|
||||
|
||||
|
||||
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Copy an .h5mu file to a new file, compressing its datasets.

    Every group and dataset of the input is recreated in the output together
    with its HDF5 attributes. Non-scalar datasets that are not compressed yet
    are rewritten with ``compression``; all other datasets are copied as-is.
    Finally the text header found at the start of the input file is written
    into the 512-byte user block of the output file.

    Args:
        input_path: Path of the existing file to read.
        output_path: Path of the file to create (opened in "w" mode).
        compression: Compression filter passed to ``create_dataset``.

    Raises:
        NotImplementedError: If the input contains an HDF5 object that is
            neither a group nor a dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the user block reserved at the start of the output file.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        # Mirror all HDF5 attributes from one object onto another.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        h5_object: Union[Group, Dataset],
    ):
        # Callback for visititems(): recreate one input object in the output.
        if isinstance(h5_object, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(h5_object, new_group)
        elif isinstance(h5_object, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects don't have a shape defined
            if not h5_object.compression and h5_object.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=h5_object, compression=compression
                )
                copy_attributes(h5_object, new_dataset)
            else:
                output_h5.copy(h5_object, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(h5_object)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        starting_metadata = input_bytes.read(userblock_size)

    # The metadata is padded with extra null bytes up until 512 bytes.
    # find() returns -1 when there is no padding at all; slicing with -1
    # would silently drop the last byte, so only truncate on a real match.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]

    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
|
||||
|
||||
|
||||
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file and write one modality into the copy.

    The file at ``h5mu`` is copied, then ``modality_data`` is written into the
    copy under ``modality_name`` with ``write_h5ad``. Without
    ``output_compression`` the copy is made directly at ``output_file``. With
    it, the copy is made next to ``output_file`` as
    ``<stem>_uncompressed.h5mu``, compressed into ``output_file`` by
    ``compress_h5mu``, and then deleted.

    Args:
        output_file: Path of the final output file.
        h5mu: Path of the existing file used as the starting point.
        modality_name: Name of the modality to write.
        modality_data: Data written for that modality.
        output_compression: Compression passed to ``compress_h5mu``; a falsy
            value (the default) skips the compression step.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)
    output_file_uncompressed = (
        output_file.with_name(output_file.stem + "_uncompressed.h5mu")
        if output_compression
        else output_file
    )
    shutil.copyfile(h5mu, output_file_uncompressed)
    write_h5ad(filename=output_file_uncompressed, mod=modality_name, data=modality_data)
    if output_compression:
        try:
            compress_h5mu(
                output_file_uncompressed, output_file, compression=output_compression
            )
        finally:
            # Remove the intermediate copy even when compression fails, so a
            # failed run does not leave the uncompressed file behind.
            output_file_uncompressed.unlink()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,126 @@
|
||||
manifest {
|
||||
name = 'dimred/pca'
|
||||
mainScript = 'main.nf'
|
||||
nextflowVersion = '!>=20.12.1-edge'
|
||||
version = 'disable-scrublet_build'
|
||||
description = 'Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11].\n'
|
||||
author = 'Dries De Maeyer'
|
||||
}
|
||||
|
||||
process.container = 'nextflow/bash:latest'
|
||||
|
||||
// detect tempdir
|
||||
tempDir = java.nio.file.Paths.get(
|
||||
System.getenv('NXF_TEMP') ?:
|
||||
System.getenv('VIASH_TEMP') ?:
|
||||
System.getenv('TEMPDIR') ?:
|
||||
System.getenv('TMPDIR') ?:
|
||||
'/tmp'
|
||||
).toAbsolutePath()
|
||||
|
||||
profiles {
|
||||
no_publish {
|
||||
process {
|
||||
withName: '.*' {
|
||||
publishDir = [
|
||||
enabled: false
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
mount_temp {
|
||||
docker.temp = tempDir
|
||||
podman.temp = tempDir
|
||||
charliecloud.temp = tempDir
|
||||
}
|
||||
docker {
|
||||
docker.enabled = true
|
||||
// docker.userEmulation = true
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
singularity {
|
||||
singularity.enabled = true
|
||||
singularity.autoMounts = true
|
||||
docker.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
podman {
|
||||
podman.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
shifter.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
shifter {
|
||||
shifter.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
charliecloud.enabled = false
|
||||
}
|
||||
charliecloud {
|
||||
charliecloud.enabled = true
|
||||
docker.enabled = false
|
||||
singularity.enabled = false
|
||||
podman.enabled = false
|
||||
shifter.enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
process{
|
||||
withLabel: mem1gb { memory = 1000000000.B }
|
||||
withLabel: mem2gb { memory = 2000000000.B }
|
||||
withLabel: mem5gb { memory = 5000000000.B }
|
||||
withLabel: mem10gb { memory = 10000000000.B }
|
||||
withLabel: mem20gb { memory = 20000000000.B }
|
||||
withLabel: mem50gb { memory = 50000000000.B }
|
||||
withLabel: mem100gb { memory = 100000000000.B }
|
||||
withLabel: mem200gb { memory = 200000000000.B }
|
||||
withLabel: mem500gb { memory = 500000000000.B }
|
||||
withLabel: mem1tb { memory = 1000000000000.B }
|
||||
withLabel: mem2tb { memory = 2000000000000.B }
|
||||
withLabel: mem5tb { memory = 5000000000000.B }
|
||||
withLabel: mem10tb { memory = 10000000000000.B }
|
||||
withLabel: mem20tb { memory = 20000000000000.B }
|
||||
withLabel: mem50tb { memory = 50000000000000.B }
|
||||
withLabel: mem100tb { memory = 100000000000000.B }
|
||||
withLabel: mem200tb { memory = 200000000000000.B }
|
||||
withLabel: mem500tb { memory = 500000000000000.B }
|
||||
withLabel: mem1gib { memory = 1073741824.B }
|
||||
withLabel: mem2gib { memory = 2147483648.B }
|
||||
withLabel: mem4gib { memory = 4294967296.B }
|
||||
withLabel: mem8gib { memory = 8589934592.B }
|
||||
withLabel: mem16gib { memory = 17179869184.B }
|
||||
withLabel: mem32gib { memory = 34359738368.B }
|
||||
withLabel: mem64gib { memory = 68719476736.B }
|
||||
withLabel: mem128gib { memory = 137438953472.B }
|
||||
withLabel: mem256gib { memory = 274877906944.B }
|
||||
withLabel: mem512gib { memory = 549755813888.B }
|
||||
withLabel: mem1tib { memory = 1099511627776.B }
|
||||
withLabel: mem2tib { memory = 2199023255552.B }
|
||||
withLabel: mem4tib { memory = 4398046511104.B }
|
||||
withLabel: mem8tib { memory = 8796093022208.B }
|
||||
withLabel: mem16tib { memory = 17592186044416.B }
|
||||
withLabel: mem32tib { memory = 35184372088832.B }
|
||||
withLabel: mem64tib { memory = 70368744177664.B }
|
||||
withLabel: mem128tib { memory = 140737488355328.B }
|
||||
withLabel: mem256tib { memory = 281474976710656.B }
|
||||
withLabel: mem512tib { memory = 562949953421312.B }
|
||||
withLabel: cpu1 { cpus = 1 }
|
||||
withLabel: cpu2 { cpus = 2 }
|
||||
withLabel: cpu5 { cpus = 5 }
|
||||
withLabel: cpu10 { cpus = 10 }
|
||||
withLabel: cpu20 { cpus = 20 }
|
||||
withLabel: cpu50 { cpus = 50 }
|
||||
withLabel: cpu100 { cpus = 100 }
|
||||
withLabel: cpu200 { cpus = 200 }
|
||||
withLabel: cpu500 { cpus = 500 }
|
||||
withLabel: cpu1000 { cpus = 1000 }
|
||||
}
|
||||
|
||||
includeConfig("nextflow_labels.config")
|
||||
@@ -0,0 +1,66 @@
|
||||
process {
|
||||
// Default resources for components that hardly do any processing
|
||||
memory = { 2.GB * task.attempt }
|
||||
cpus = 1
|
||||
|
||||
// Retry for exit codes that have something to do with memory issues
|
||||
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
|
||||
maxRetries = 3
|
||||
maxMemory = null
|
||||
|
||||
// CPU resources
|
||||
withLabel: singlecpu { cpus = 1 }
|
||||
withLabel: lowcpu { cpus = 4 }
|
||||
withLabel: midcpu { cpus = 10 }
|
||||
withLabel: highcpu { cpus = 20 }
|
||||
|
||||
// Memory resources
|
||||
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
|
||||
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
|
||||
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
|
||||
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
|
||||
|
||||
// Disk space
|
||||
withLabel: lowdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: middisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: highdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
withLabel: veryhighdisk {
|
||||
disk = {process.disk ? process.disk : null}
|
||||
}
|
||||
|
||||
// NOTE: The above labels intentionally do not have an effect by default.
|
||||
// The user should set the disk space requirements by adding the following
|
||||
// to the compute environment:
|
||||
//
|
||||
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
|
||||
// withLabel: middisk { disk = { 100.GB * task.attempt } }
|
||||
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
|
||||
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
|
||||
}
|
||||
|
||||
/**
 * Cap a requested memory amount at the optional `process.maxMemory` ceiling.
 *
 * @param to_compare the memory amount requested for the current attempt;
 *                   it must support `compareTo` against a
 *                   `nextflow.util.MemoryUnit`.
 * @return `to_compare` when no `process.maxMemory` is configured or the request
 *         does not exceed it; `process.maxMemory` when the request exceeds it or
 *         when `task.attempt` equals `process.maxRetries`.
 *
 * If the configured values cannot be processed, an error is printed and the
 * JVM exits with status 1.
 */
def get_memory(to_compare) {
  // No ceiling configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // On the attempt whose number equals maxRetries, ask for the full ceiling.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    // compareTo only guarantees the sign of its result, so test "> 0" rather than "== 1".
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // This branch previously returned `max_memory`, a name that is not defined
      // in this config, so every over-limit request fell into the catch block.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}
|
||||
@@ -0,0 +1,16 @@
|
||||
# Arguments
|
||||
input: # please fill in - example: "input.h5mu"
|
||||
modality: "rna"
|
||||
# layer: "foo"
|
||||
# var_input: "filter_with_hvg"
|
||||
# output: "$id.$key.output.h5mu"
|
||||
obsm_output: "X_pca"
|
||||
varm_output: "pca_loadings"
|
||||
uns_output: "pca_variance"
|
||||
# num_components: 25
|
||||
overwrite: false
|
||||
# output_compression: "gzip"
|
||||
|
||||
# Nextflow input-output arguments
|
||||
publish_dir: # please fill in - example: "output/"
|
||||
# param_list: "my_params.yaml"
|
||||
@@ -0,0 +1,117 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "pca",
|
||||
"description": "Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11].\n",
|
||||
"type": "object",
|
||||
"$defs": {
|
||||
"Dataset input": {
|
||||
"title": "Dataset input",
|
||||
"type": "object",
|
||||
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
|
||||
"properties": {
|
||||
"param_list": {
|
||||
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"format": "file-path",
|
||||
"mimetype": "text/csv"
|
||||
}
|
||||
}
|
||||
},
|
||||
"arguments": {
|
||||
"title": "Arguments",
|
||||
"type": "object",
|
||||
"description": "No description",
|
||||
"properties": {
|
||||
"input": {
|
||||
"type": "string",
|
||||
"format": "path",
|
||||
"exists": true,
|
||||
"description": "Input h5mu file",
|
||||
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"input.h5mu\"`. "
|
||||
},
|
||||
"modality": {
|
||||
"type": "string",
|
||||
"description": "Which modality from the input MuData file to process.\n",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"rna\"`. ",
|
||||
"default": "rna"
|
||||
},
|
||||
"layer": {
|
||||
"type": "string",
|
||||
"description": "Use specified layer for expression values instead of the .X object from the modality.",
|
||||
"help_text": "Type: `string`, multiple: `False`. "
|
||||
},
|
||||
"var_input": {
|
||||
"type": "string",
|
||||
"description": "Column name in .var matrix that will be used to select which genes to run the PCA on.",
|
||||
"help_text": "Type: `string`, multiple: `False`, example: `\"filter_with_hvg\"`. "
|
||||
},
|
||||
"output": {
|
||||
"type": "string",
|
||||
"format": "path",
|
||||
"description": "Output h5mu file.",
|
||||
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.h5mu\"`, direction: `output`, example: `\"output.h5mu\"`. ",
|
||||
"default": "$id.$key.output.h5mu"
|
||||
},
|
||||
"obsm_output": {
|
||||
"type": "string",
|
||||
"description": "In which .obsm slot to store the resulting embedding.",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"X_pca\"`. ",
|
||||
"default": "X_pca"
|
||||
},
|
||||
"varm_output": {
|
||||
"type": "string",
|
||||
"description": "In which .varm slot to store the resulting loadings matrix.",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"pca_loadings\"`. ",
|
||||
"default": "pca_loadings"
|
||||
},
|
||||
"uns_output": {
|
||||
"type": "string",
|
||||
"description": "In which .uns slot to store the resulting variance objects.",
|
||||
"help_text": "Type: `string`, multiple: `False`, default: `\"pca_variance\"`. ",
|
||||
"default": "pca_variance"
|
||||
},
|
||||
"num_components": {
|
||||
"type": "integer",
|
||||
"description": "Number of principal components to compute",
|
||||
"help_text": "Type: `integer`, multiple: `False`, example: `25`. "
|
||||
},
|
||||
"overwrite": {
|
||||
"type": "boolean",
|
||||
"description": "Allow overwriting .obsm, .varm and .uns slots.",
|
||||
"help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ",
|
||||
"default": false
|
||||
},
|
||||
"output_compression": {
|
||||
"type": "string",
|
||||
"description": "Compression format to use for the output AnnData and/or Mudata objects.\nBy default no compression is applied.\n",
|
||||
"help_text": "Type: `string`, multiple: `False`, example: `\"gzip\"`, choices: `gzip`, `lzf`. ",
|
||||
"enum": [
|
||||
"gzip",
|
||||
"lzf"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"nextflow input-output arguments": {
|
||||
"title": "Nextflow input-output arguments",
|
||||
"type": "object",
|
||||
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
|
||||
"properties": {
|
||||
"publish_dir": {
|
||||
"type": "string",
|
||||
"description": "Path to an output directory.",
|
||||
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"allOf": [
|
||||
{
|
||||
"$ref": "#/$defs/arguments"
|
||||
},
|
||||
{
|
||||
"$ref": "#/$defs/nextflow input-output arguments"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    Records are formatted as ``<time> <level, padded to 8 chars> <message>``.
    The function is safe to call repeatedly: a console handler is attached
    only if this helper has not already attached one for the current
    ``sys.stdout``.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    handler_name = "setup_logger_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Without this guard every call stacks one more handler on the root
    # logger and each record is printed once per call.
    already_attached = any(
        handler.get_name() == handler_name
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        console_handler.set_name(handler_name)
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        )
        logger.addHandler(console_handler)

    return logger
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user