Build branch remove-unreleased-dependencies with version remove-unreleased-dependencies (2948b94)

Build pipeline: openpipelines-bio.openpipeline-spatial.remove-unreleased-d9cfkp

Source commit: 2948b940db

Source message: remove unreleased dependencies
This commit is contained in:
CI
2025-08-22 08:38:44 +00:00
commit f396f5eda7
463 changed files with 238009 additions and 0 deletions

58
.gitignore vendored Normal file
View File

@@ -0,0 +1,58 @@
# IDEs and editors
/.idea
.project
.classpath
*.launch
.settings/
.vscode
# Temp
gitignore
test_results
# System Files
.DS_Store
Thumbs.db
# Nextflow
work
.nextflow*
# viash
check_results/
out/
output*
output_log/
resources_test
/viash_tools/
/test/
# jupyter notebook
/.ipynb_checkpoints/
*.ipynb
# compress
/__MACOSX/
# python
*__pycache__*
# Python virtual environments
.venv
# temporary files related
temp
# NextFlow
work/
.nextflow.log
.nextflow*
out/
trace*.txt
# Macos
.DS_Store
# vscode
.vscode/launch.json
.vscode/settings.json

24
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,24 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.12.1
hooks:
- id: ruff-check
args: [ --fix ]
- id: ruff-format
- repo: local
hooks:
- id: run_styler
name: run_styler
language: r
description: style files with {styler}
entry: "Rscript -e 'styler::style_file(commandArgs(TRUE))'"
files: '(\.[rR]profile|\.[rR]|\.[rR]md|\.[rR]nw|\.[qQ]md)$'
additional_dependencies:
- styler
- knitr
- repo: https://github.com/lorenzwalthert/precommit
rev: v0.4.3.9012
hooks:
- id: lintr

19
CHANGELOG.md Normal file
View File

@@ -0,0 +1,19 @@
# openpipeline_spatial 0.0.0
## NEW FUNCTIONALITY
* `filter/subset_cosmx`: Added a component to subset COSMX data (PR #3, PR #9).
* `convert/from_cosmx_to_h5mu`: Added converter component for COSMX data (PR #3, PR #9).
* `mapping/spaceranger_count`: Added a spaceranger count component (PR #2).
* `convert/from_spatialdata_to_h5mu`, `convert/from_xenium_to_spatialdata`, `convert/from_xenium_to_h5mu`: Added converter components for xenium data (PR #1, #10).
* `convert/from_xenium_to_spatialexperiment`, `convert/from_cosmx_to_spatialexperiment`: Added converter components for Xenium or CosMx data to SpatialExperiment objects (PR #9).
* `convert/from_cells2stats_to_h5mu`: Added a component to convert data resulting from Aviti Teton sequencers processed by Cells2Stats into an H5MU file (PR #15).
* `workflows/qc/qc`: Added a pipeline for calculating qc metrics of spatial omics samples (PR #5).
* `workflows/multiomics/spatial_process_samples`: Added a pipeline to pre-process multiple spatial omics samples (PR #7).

27
_viash.yaml Normal file
View File

@@ -0,0 +1,27 @@
viash_version: 0.9.4
source: src
target: target
name: openpipeline_spatial
organization: openpipelines-bio
links:
repository: https://github.com/openpipelines-bio/openpipeline_spatial
docker_registry: ghcr.io
repositories:
- name: openpipeline
repo: openpipelines-bio/openpipeline
type: github
tag: 2.1.2
info:
test_resources:
- type: s3
path: s3://openpipelines-bio/openpipeline_spatial/resources_test
dest: resources_test
config_mods: |
.resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}
.runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'

0
main.nf Normal file
View File

0
nextflow.config Normal file
View File

View File

@@ -0,0 +1,116 @@
#!/bin/bash
# Builds the tiny Aviti Teton (cells2stats) test dataset: downloads the public
# PLUT-0105 bundle, subsets it with an inline Python script, converts the
# subset to H5MU and finally (dry-run) syncs the result to S3.

set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

ID=aviti
DIR=resources_test/$ID/
OUT=$DIR/teton_cells2stats_tiny/

# Create directories
[ -d "$DIR" ] || mkdir -p "$DIR"
[ -d "$OUT" ] || mkdir -p "$OUT"

echo "> Downloading Aviti Teton data"
wget "https://go.elementbiosciences.com/l/938263/28kddnj7/d59cp" -O "${DIR}/PLUT-0105.tar.gz"
tar -xzf "${DIR}/PLUT-0105.tar.gz" -C "$DIR"
rm "${DIR}/PLUT-0105.tar.gz"

echo "> Processing and subsetting Aviti Teton data"
# NOTE: the heredoc delimiter is intentionally unquoted so that the shell
# substitutes DIR and OUT into the Python source below.
python <<HEREDOC
import os
import shutil
import pandas as pd
import glob
import json

src_dir = "${DIR}/PLUT-0105"
dest_dir = "${OUT}"
subset_image_dirs = False
wells_to_keep = ["A1"]
max_cells_per_well = 1000

os.makedirs(dest_dir, exist_ok=True)
print(f"Processing data from {src_dir} to {dest_dir}")

# Copy images
if subset_image_dirs:
    image_dirs = ["CellSegmentation", "Projection"]
    for image_dir in image_dirs:
        image_dir_path = os.path.join(src_dir, image_dir)
        if not os.path.exists(image_dir_path):
            print(f"Warning: Image directory not found: {image_dir_path}")
            continue
        if not os.path.isdir(image_dir_path):
            print(f"Warning: Path exists but is not a directory: {image_dir_path}")
            continue
        print(f"Processing image directory: {image_dir}")
        for well in wells_to_keep:
            dest_path = f"{dest_dir}/{image_dir}/Well{well}"
            os.makedirs(dest_path, exist_ok=True)
            src_path = glob.glob(os.path.join(src_dir, image_dir, f"Well{well}"))
            if len(src_path) != 1:
                print(f"Warning: Expected 1 path for Well{well}, found {len(src_path)}")
                continue
            shutil.copytree(src_path[0], os.path.join(dest_path), dirs_exist_ok=True)

# Copy count matrix
src_path = os.path.join(src_dir, "Cytoprofiling", "Instrument", "RawCellStats.parquet")
if os.path.exists(src_path):
    print(f"Processing count matrix: {src_path}")
    df = pd.read_parquet(src_path)
    print(f"Original data: {len(df)} rows")

    # Filter by wells
    df = df[df["Well"].isin(wells_to_keep)]
    print(f"After well filtering: {len(df)} rows")

    if max_cells_per_well:
        # Limit the number of cells per well. A plain head() would cap the
        # whole table instead, which is only equivalent for a single well.
        df = df.groupby("Well", sort=False).head(max_cells_per_well)
        print(f"After cell limit: {len(df)} rows")

    dest_path = os.path.join(dest_dir, "Cytoprofiling", "Instrument")
    os.makedirs(dest_path, exist_ok=True)
    dest_file = os.path.join(dest_path, "RawCellStats.parquet")
    df.to_parquet(dest_file, engine="pyarrow")
    print(f"Saved processed count matrix to {dest_file}")
else:
    print(f"Warning: Count matrix not found at {src_path}")

# Copy Panel Metadata
panel_src_path = os.path.join(src_dir, "Panel.json")
if os.path.exists(panel_src_path):
    panel_dest_path = os.path.join(dest_dir, "Panel.json")
    shutil.copy2(panel_src_path, panel_dest_path)
    print(f"Copied Panel.json")
else:
    print(f"Warning: Panel.json not found at {panel_src_path}")

print("Processing complete!")
HEREDOC

echo "> Removing original aviti_teton folder"
rm -rf "$DIR/PLUT-0105"

echo "> Aviti Teton tiny dataset created successfully at $OUT"

viash run src/convert/from_cells2stats_to_h5mu/config.vsh.yaml -- \
  --input "${OUT}" \
  --output "$DIR/aviti_teton_tiny.h5mu" \
  --output_compression "gzip"

echo "> Conversion to H5MU complete"

# --dryrun only prints what would be uploaded/deleted; drop it to really sync.
aws s3 sync \
  --profile di \
  "$DIR" \
  s3://openpipelines-bio/openpipeline_spatial/resources_test/aviti \
  --delete \
  --dryrun

View File

@@ -0,0 +1,52 @@
#!/bin/bash
# Builds the tiny CosMx test dataset: downloads the Lung5_Rep2 flat files (if
# not already present), subsets them with the subset_cosmx component, converts
# the subset to H5MU and finally (dry-run) syncs the result to S3.

set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

DIR="resources_test/cosmx"
ID="Lung5_Rep2"
OUT="$DIR/$ID/"

# create tempdir
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")

# remove the temporary download directory whenever the script exits
function clean_up {
  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
}
trap clean_up EXIT

if [ ! -d "$OUT" ]; then
  flat_dataset="https://nanostring-public-share.s3.us-west-2.amazonaws.com/SMI-Compressed/Lung5_Rep2/Lung5_Rep2+SMI+Flat+data.tar.gz"
  wget "$flat_dataset" -O "$TMPDIR/Lung5_Rep2.tar.gz"
  mkdir -p "$TMPDIR/Lung5_Rep2"
  tar -xzf "$TMPDIR/Lung5_Rep2.tar.gz" -C "$TMPDIR/Lung5_Rep2"
  mkdir -p "$OUT"
  mv "$TMPDIR/Lung5_Rep2/Lung5_Rep2/Lung5_Rep2-Flat_files_and_images/"* "$OUT/"
fi

viash run src/filter/subset_cosmx/config.vsh.yaml -- \
  --input "$OUT" \
  --num_fovs 3 \
  --subset_transcripts_file True \
  --subset_polygons_file False \
  --output "${DIR}/${ID}_tiny"

# quoted like every other path in this script so the argument stays a single
# word regardless of what DIR/ID contain
viash run src/convert/from_cosmx_to_h5mu/config.vsh.yaml -- \
  --input "${DIR}/${ID}_tiny" \
  --output "$DIR/${ID}_tiny.h5mu" \
  --output_compression "gzip"

# the full-size download is no longer needed once the subset exists
rm -rf "$OUT"

# Sync to S3 (--dryrun only prints the planned changes)
aws s3 sync \
  --profile di \
  "$DIR" \
  s3://openpipelines-bio/openpipeline_spatial/resources_test/cosmx \
  --delete \
  --dryrun

View File

@@ -0,0 +1,19 @@
#!/bin/bash
# Fetches the GRCh38 reference test resources from S3 into resources_test/GRCh38.

set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

DIR="resources_test/GRCh38"

# quoted so the path is always passed as a single argument
mkdir -p "$DIR"

# Direction is S3 -> local here. --dryrun only prints the planned changes;
# drop it to actually download (and, because of --delete, prune local files).
aws s3 sync \
  --profile di \
  s3://openpipelines-bio/openpipeline_spatial/resources_test/GRCh38 \
  "$DIR" \
  --delete \
  --dryrun

View File

@@ -0,0 +1,35 @@
#!/bin/bash
# Builds the Visium test resources: downloads the 10x FFPE human ovarian cancer
# dataset, creates a subsampled copy (resized image, first 40000 lines of each
# lane-1 FASTQ) and finally (dry-run) syncs the directory to S3.

set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# Define absolute directory path
DIR="$REPO_ROOT/resources_test/visium"

# from https://www.10xgenomics.com/resources/datasets/human-ovarian-cancer-1-standard
mkdir -p "$DIR"

# Input Files - download to the specific directory
# -f makes curl exit non-zero on HTTP errors instead of saving the error page
# as if it were the dataset; -L follows redirects.
curl -fL -o "$DIR/Visium_FFPE_Human_Ovarian_Cancer_fastqs.tar" https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Ovarian_Cancer/Visium_FFPE_Human_Ovarian_Cancer_fastqs.tar
curl -fL -o "$DIR/Visium_FFPE_Human_Ovarian_Cancer_image.jpg" https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Ovarian_Cancer/Visium_FFPE_Human_Ovarian_Cancer_image.jpg
curl -fL -o "$DIR/Visium_FFPE_Human_Ovarian_Cancer_probe_set.csv" https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Ovarian_Cancer/Visium_FFPE_Human_Ovarian_Cancer_probe_set.csv

# Extract in the specific directory
tar xvf "$DIR/Visium_FFPE_Human_Ovarian_Cancer_fastqs.tar" -C "$DIR"

# Create subsampled dataset with ImageMagick
# https://imagemagick.org/index.php
mkdir -p "$DIR/subsampled"
convert "$DIR/Visium_FFPE_Human_Ovarian_Cancer_image.jpg" -resize 2000x2000 "$DIR/subsampled/Visium_FFPE_Human_Ovarian_Cancer_image.jpg"

for f in "$DIR"/Visium_FFPE_Human_Ovarian_Cancer_fastqs/*L001*R*; do
  # `head` exits after 40000 lines, so the upstream gzip is killed by SIGPIPE
  # (exit status 141). With `set -o pipefail` that status would fail the whole
  # pipeline and `set -e` would abort the script, so accept exactly 141 while
  # still propagating any other gzip failure.
  { gzip -cdf "$f" || [ "$?" -eq 141 ]; } | head -n 40000 | gzip -c > "$DIR/subsampled/$(basename "$f")"
done

# --dryrun only prints what would be uploaded/deleted; drop it to really sync.
aws s3 sync \
  --profile di \
  "$DIR" \
  s3://openpipelines-bio/openpipeline_spatial/resources_test/visium \
  --delete \
  --dryrun

View File

@@ -0,0 +1,44 @@
#!/bin/bash
# Builds the tiny Xenium test resources: fetches the nf-core tiny Xenium
# bundle (unless already present), converts it to a SpatialData zarr store and
# then to H5MU, and finally (dry-run) syncs the directory to S3.

set -eo pipefail

# Repository root; every path below is absolute and derived from it.
REPO_ROOT=$(git rev-parse --show-toplevel)

DIR="$REPO_ROOT/resources_test/xenium"
ID="xenium_tiny"
OUT="$DIR/$ID"

# Scratch directory for the download, removed again when the script exits.
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")

clean_up() {
  if [[ -d "$TMPDIR" ]]; then
    rm -r "$TMPDIR"
  fi
}
trap clean_up EXIT

# Only download and unpack when the raw bundle is not there yet.
if ! [[ -d "$OUT" ]]; then
  bundle_url="https://raw.githubusercontent.com/nf-core/test-datasets/spatialxe/Xenium_Prime_Mouse_Ileum_tiny_outs.zip"
  bundle_zip="$TMPDIR/xenium_tiny.zip"
  bundle_dir="$TMPDIR/xenium_tiny"

  wget "$bundle_url" -O "$bundle_zip"
  unzip -q "$bundle_zip" -d "$bundle_dir"
  mkdir -p "$OUT"
  mv "$bundle_dir/Xenium_Prime_Mouse_Ileum_tiny_outs/"* "$OUT/"
fi

# Xenium bundle -> SpatialData (zarr)
viash run "$REPO_ROOT/src/convert/from_xenium_to_spatialdata/config.vsh.yaml" -- \
  --input "$OUT" \
  --output "$DIR/$ID.zarr"

# SpatialData (zarr) -> H5MU
viash run "$REPO_ROOT/src/convert/from_spatialdata_to_h5mu/config.vsh.yaml" -- \
  --input "$DIR/$ID.zarr" \
  --output "$DIR/$ID.h5mu"

# Sync to S3 (--dryrun only prints the planned changes)
aws s3 sync \
  --profile di \
  "$DIR" \
  s3://openpipelines-bio/openpipeline_spatial/resources_test/xenium \
  --delete \
  --dryrun

43
ruff.toml Normal file
View File

@@ -0,0 +1,43 @@
# Exclude a variety of commonly ignored directories.
exclude = [
".git",
".pyenv",
".pytest_cache",
".ruff_cache",
".venv",
".vscode",
"__pypackages__",
"_build",
"build",
"dist",
"node_modules",
"site-packages",
]
builtins = ["meta"]
[format]
# Like Black, use double quotes for strings.
quote-style = "double"
# Like Black, indent with spaces, rather than tabs.
indent-style = "space"
# Like Black, respect magic trailing commas.
skip-magic-trailing-comma = false
# Like Black, automatically detect the appropriate line ending.
line-ending = "auto"
[lint.flake8-pytest-style]
fixture-parentheses = false
mark-parentheses = false
[lint]
ignore = [
# module level import not at top of file
"E402"
]

View File

@@ -0,0 +1,11 @@
name: Dorien Roosen
info:
role: Core Team Member
links:
email: dorien@data-intuitive.com
github: dorien-er
linkedin: dorien-roosen
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist

View File

@@ -0,0 +1,12 @@
name: Dries Schaumont
info:
role: Core Team Member
links:
email: dries@data-intuitive.com
github: DriesSchaumont
orcid: "0000-0002-4389-0440"
linkedin: dries-schaumont
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Scientist

View File

@@ -0,0 +1,11 @@
name: Jakub Majercik
info:
role: Contributor
links:
email: jakub@data-intuitive.com
github: jakubmajercik
linkedin: jakubmajercik
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Bioinformatics Engineer

View File

@@ -0,0 +1,15 @@
name: Robrecht Cannoodt
info:
role: Core Team Member
links:
email: robrecht@data-intuitive.com
github: rcannood
orcid: "0000-0003-3641-729X"
linkedin: robrechtcannoodt
organizations:
- name: Data Intuitive
href: https://www.data-intuitive.com
role: Data Science Engineer
- name: Open Problems
href: https://openproblems.bio
role: Core Member

View File

@@ -0,0 +1,6 @@
name: Weiwei Schultz
info:
role: Contributor
organizations:
- name: Janssen R&D US
role: Associate Director Data Sciences

View File

@@ -0,0 +1,9 @@
arguments:
- name: "--output_compression"
description: |
Compression format to use for the output AnnData and/or Mudata objects.
By default no compression is applied.
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"

View File

@@ -0,0 +1,2 @@
packages:
- anndata~=0.11.1

View File

@@ -0,0 +1,5 @@
__merge__: [/src/base/requirements/anndata.yaml, .]
packages:
- mudata~=0.3.1
script: |
exec("try:\n import awkward\nexcept ModuleNotFoundError:\n exit(0)\nelse: exit(1)")

View File

@@ -0,0 +1,2 @@
github:
- openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils

View File

@@ -0,0 +1,8 @@
test_setup:
- type: apt
packages:
- git
- type: python
__merge__:
- /src/base/requirements/viashpy.yaml
- /src/base/requirements/openpipeline_testutils.yaml

View File

@@ -0,0 +1,2 @@
packages:
- scanpy~=1.10.4

View File

@@ -0,0 +1,3 @@
__merge__: [/src/base/requirements/spatialdata.yaml, .]
packages:
- spatialdata-io~=0.2.0

View File

@@ -0,0 +1,2 @@
packages:
- spatialdata~=0.4.1rc

View File

@@ -0,0 +1,3 @@
__merge__: [/src/base/requirements/spatialdata.yaml, .]
packages:
- squidpy~=1.6.5

View File

@@ -0,0 +1,2 @@
packages:
- viashpy==0.9.0

View File

@@ -0,0 +1,128 @@
name: from_cells2stats_to_h5mu
namespace: convert
scope: public
description: |
Convert spatial data resulting from Aviti Teton sequencers that have been processed by the Element Biosciences cells2stats workflow to H5MU format.
This component processes cells2stats count matrices to create a standardized H5MU file for downstream analysis.
The component reads:
- Parquet file containing the count matrix and metadata
- Panel.json with target and batch information
And outputs an H5MU file with:
- Count data as the main .X matrix
- Spatial coordinates in obsm
- Cell Paint intensities in obsm (optional)
- Nuclear count data as a layer (optional)
- CellProfiler morphology metrics in obsm (optional)
- Unassigned targets in obsm (optional)
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer ]
argument_groups:
- name: Inputs
arguments:
- name: --input
type: file
direction: input
required: true
description: |
Path to the cells2stats output bundle.
Expected folder structure (showing required files only):
├── Cytoprofiling/
│ └── Instrument/
│ └── RawCellStats.parquet
└── Panel.json
example: path/to/aviti_output/
- name: Outputs
arguments:
- name: --output
type: file
direction: output
required: true
description: Output H5MU file path.
example: output.h5mu
__merge__: [., /src/base/h5_compression_argument.yaml]
- name: Options
arguments:
- name: --modality
type: string
default: rna
description: The modality to which the processed data will be written in the H5MU file.
- name: --obsm_coordinates
type: string
description: |
Key name to store the spatial coordinates (in pixels) in obsm.
If present, spatial coordinates in micrometers will be stored under {obsm_coordinates}_um.
The column names will be stored in uns.
default: spatial
- name: --layer_nuclear_counts
type: string
description: |
Name for nuclear counts layer. If specified, nuclear count data
will be stored as a separate layer in the AnnData object.
example: nuclear_counts
- name: --obsm_cell_paint
type: string
description: |
Key name for storing Cell Paint target intensities in obsm.
If provided, Cell Paint target intensity data will be stored as a separate matrix in the obsm field.
The column names will be stored in uns.
example: cell_paint
- name: --obsm_cell_paint_nuclear
type: string
description: |
Key name for storing Nuclear Cell Paint target intensities in obsm.
If provided, Nuclear Cell Paint target intensity data will be stored as a separate matrix in the obsm field.
The column names will be stored in uns.
example: cell_paint_nuclear
- name: --obsm_cell_profiler
type: string
description: |
Key name for storing CellProfiler morphology metrics in obsm.
If provided, CellProfiler morphology metrics will be stored as a separate matrix in the obsm field.
The column names will be stored in uns.
example: cell_profiler
- name: --obsm_unassigned_targets
type: string
description: |
Key name for storing any unassigned target data in obsm.
If provided, unassigned target data will be stored as a separate matrix in the obsm field.
The column names will be stored in uns.
example: unassigned_targets
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/aviti/
engines:
- type: docker
image: python:3.13-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, .]
packages:
- pyarrow
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, lowcpu]

View File

@@ -0,0 +1,285 @@
import sys
from pathlib import Path
import scipy.sparse as sp
import pandas as pd
import mudata as mu
import anndata as ad
import re
import json
## VIASH START
par = {
"input": "./resources_test/aviti/aviti_teton_tiny_2",
"modality": "rna",
"output": "aviti_tiny_test.h5mu",
"output_compression": "gzip",
"layer_nuclear_counts": "nuclear_counts",
"obsm_coordinates": "spatial",
"obsm_cell_paint": "cell_paint",
"obsm_cell_paint_nuclear": "cell_paint_nuclear",
"obsm_cell_profiler": "cell_profiler",
"obsm_unassigned_targets": "unassigned_targets",
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
logger = setup_logger()
def assert_matching_order(var_names, count_columns, split_pattern=None):
    """Assert that ``count_columns`` line up one-to-one with ``var_names``.

    Args:
        var_names: Expected feature names, in order.
        count_columns: Column names to verify, in the same order.
        split_pattern: Optional separator. When given, each column name is cut
            at the first occurrence of this string and only the part before it
            is compared with the feature name. When falsy, the column name is
            compared unchanged.

    Raises:
        AssertionError: If the two sequences differ in length or any pair of
            names does not match.
    """
    var_names = list(var_names)
    count_columns = list(count_columns)
    # zip() would silently ignore surplus entries, hiding a mismatch.
    assert len(var_names) == len(count_columns), "Orders do not match"
    for var, col in zip(var_names, count_columns):
        # Honour the caller's separator instead of a hard-coded "_Nuclear".
        count_var = col.split(split_pattern)[0] if split_pattern else col
        assert var == count_var, "Orders do not match"
def categorize_columns(column_list, target_panel):
    """Split the column names of a cells2stats table into functional groups.

    Args:
        column_list: All column names of the table.
        target_panel: Parsed ``Panel.json`` content. The keys
            ``ImagingPrimerTubes``, ``BarcodingPrimerTubes`` (entries with
            ``BatchName``), ``ImagingTargets`` and ``BarcodingTargets``
            (entries with ``Target``) are read.

    Returns:
        An 8-tuple of lists of column names, in this order: fixed obs columns,
        coordinate columns, Cell Paint columns, nuclear Cell Paint columns,
        CellProfiler columns, count columns, nuclear count columns and
        unassigned columns. The fixed obs and coordinate columns follow the
        order of the reference lists below; every other group follows the
        order of ``column_list``.

    Raises:
        AssertionError: If the number of count and nuclear count columns
            differ, or if the groups together do not account for as many
            columns as ``column_list`` holds.
    """
    # Extract imaging and barcoding information from Panel.json
    imaging_batches = [tube["BatchName"] for tube in target_panel["ImagingPrimerTubes"]]
    barcoding_batches = [
        tube["BatchName"] for tube in target_panel["BarcodingPrimerTubes"]
    ]

    # Extract target information
    cellpaint_targets = [target["Target"] for target in target_panel["ImagingTargets"]]
    barcoding_targets = [
        target["Target"] for target in target_panel["BarcodingTargets"]
    ]

    available_columns = set(column_list)

    def select_target_columns(targets, batches, target_suffix):
        # Columns named "<target><target_suffix>...<.batch>" for any
        # target/batch combination, kept in the order of column_list.
        return [
            col
            for col in column_list
            if any(
                col.startswith(f"{target}{target_suffix}")
                and col.endswith(f".{batch}")
                for target in targets
                for batch in batches
            )
        ]

    # METADATA (for .obs and .obsm)
    # Fixed columns
    columns_fixed = [
        "Area",
        "AreaUm",
        "Cell",
        "NuclearArea",
        "NuclearAreaUm",
        "Tile",
        "Well",
        "WellLabel",
    ]
    # Filter the reference list rather than intersecting sets: a set
    # intersection yields an arbitrary order that can change between runs.
    obs_columns_fixed = [col for col in columns_fixed if col in available_columns]

    # Coordinate columns
    coordinate_columns = ["X", "Y", "Xum", "Yum"]
    obsm_coordinate_columns = [
        col for col in coordinate_columns if col in available_columns
    ]

    # Cell Paint target intensity columns (format: {cell_paint_target.batch})
    cell_paint_columns = select_target_columns(cellpaint_targets, imaging_batches, ".")

    # Cell Paint nuclear target intensity columns (format: {cell_paint_target_Nuclear.batch})
    cell_paint_nuclear_columns = select_target_columns(
        cellpaint_targets, imaging_batches, "_Nuclear"
    )

    # CellProfiler morphology metrics
    morphology_patterns = [
        r"^AreaShape_",
        r"^Granularity_",
        r"^Texture_",
        r"^Intensity_",
        r"^Location_",
        r"^RadialDistribution_",
    ]
    cell_profiler_columns = [
        col
        for col in column_list
        if any(re.match(pattern, col) for pattern in morphology_patterns)
    ]

    # COUNT MATRICES (for .X and layers)
    # Feature Count Matrix - barcoding targets (format: {target.batch})
    # Includes cellular and nuclear counts
    count_columns = select_target_columns(barcoding_targets, barcoding_batches, ".")

    # Nuclear Feature Count Matrix - barcoding targets (format: {target_Nuclear.batch})
    # Includes only nuclear counts
    nuclear_count_columns = select_target_columns(
        barcoding_targets, barcoding_batches, "_Nuclear"
    )

    # Unassigned columns (format: {Unassigned_*.*})
    unassigned_columns = [col for col in column_list if col.startswith("Unassigned")]

    # Make sure all columns have been categorized and have expected sizes
    assert len(count_columns) == len(nuclear_count_columns), (
        "Cellular and nuclear count columns do not match."
    )
    all_categorized_columns = (
        obs_columns_fixed
        + obsm_coordinate_columns
        + cell_paint_columns
        + cell_paint_nuclear_columns
        + cell_profiler_columns
        + count_columns
        + nuclear_count_columns
        + unassigned_columns
    )
    assert len(column_list) == len(all_categorized_columns), (
        "Column categorization incomplete."
    )

    return (
        obs_columns_fixed,
        obsm_coordinate_columns,
        cell_paint_columns,
        cell_paint_nuclear_columns,
        cell_profiler_columns,
        count_columns,
        nuclear_count_columns,
        unassigned_columns,
    )
def main():
    """Convert a cells2stats output bundle into an H5MU file.

    Reads ``Cytoprofiling/Instrument/RawCellStats.parquet`` and ``Panel.json``
    from ``par["input"]``, builds an AnnData object (counts in ``.X``, fixed
    metadata in ``.obs``, coordinates and the optional groups in ``.obsm`` with
    their column names in ``.uns``, optional nuclear counts as a layer) and
    writes it as modality ``par["modality"]`` to ``par["output"]``.

    Raises:
        AssertionError: If a required input file is missing, or if the
            nuclear count columns are not ordered like the count columns.
    """
    # Read data from Aviti Teton output bundle
    # Expected folder structure (showing only relevant files):
    # ├── Cytoprofiling/
    # │   └── Instrument/
    # │       └── RawCellStats.parquet
    # └── Panel.json
    logger.info("Reading input data...")
    input_dir = Path(par["input"])
    input_data = {
        "count_matrix": input_dir
        / "Cytoprofiling"
        / "Instrument"
        / "RawCellStats.parquet",
        "target_panel": input_dir / "Panel.json",
    }
    assert all([file.exists() for file in input_data.values()]), (
        f"Not all required input files are found. Make sure that {par['input']} contains {input_data.values()}."
    )

    with open(input_data["target_panel"], "r") as f:
        target_panel = json.load(f)

    df = pd.read_parquet(input_data["count_matrix"], engine="pyarrow")
    df_columns = df.columns.tolist()

    logger.info("Categorizing input data...")
    (
        obs_columns_fixed,
        coordinate_columns,
        cell_paint_columns,
        cell_paint_nuclear_columns,
        cell_profiler_columns,
        count_columns,
        nuclear_count_columns,
        unassigned_columns,
    ) = categorize_columns(df_columns, target_panel)

    # Index rows by the (stringified) cell id while keeping "Cell" as a column.
    df = df.set_index(df["Cell"].astype(str), drop=False)
    # Clear the index name so it does not clash with the "Cell" column.
    # (Assigning to `df.index_name` would only create a stray attribute.)
    df.index.name = None

    # var and obs names
    var_names = [var.split(".")[0] for var in count_columns]
    obs_names = df["Cell"].astype(str).tolist()

    # Count matrix
    logger.info("Creating count matrix...")
    count_df = df[count_columns].copy()
    count_matrix_sparse = sp.csr_matrix(count_df.values)

    # Obs field
    logger.info(f"Creating obs field with columns {obs_columns_fixed}")
    obs_df = df[obs_columns_fixed].copy()

    # Create AnnData object
    logger.info("Creating AnnData object...")
    adata = ad.AnnData(
        X=count_matrix_sparse,
        obs=obs_df,
        var=pd.DataFrame(index=var_names),
    )
    adata.obs_names = obs_names
    adata.var_names = var_names

    # Spatial coordinates
    coordinate_sets = {
        par["obsm_coordinates"]: ["X", "Y"],
        f"{par['obsm_coordinates']}_um": ["Xum", "Yum"],
    }
    for obsm_key, coord_cols in coordinate_sets.items():
        if all(col in coordinate_columns for col in coord_cols):
            coordinates = df[coord_cols].copy()
            adata.obsm[obsm_key] = coordinates.values
            adata.uns[obsm_key] = coord_cols
            logger.info(f"Added {obsm_key} coordinates ({coord_cols}) to obsm")
        else:
            missing_cols = [col for col in coord_cols if col not in coordinate_columns]
            logger.warning(
                f"Skipping {obsm_key}: missing coordinate columns {missing_cols}"
            )

    # Add (optional) .obsm fields
    if par["obsm_cell_paint"]:
        logger.info(f"Adding {par['obsm_cell_paint']} to obsm")
        adata.obsm[par["obsm_cell_paint"]] = df[cell_paint_columns].copy()
        adata.uns[par["obsm_cell_paint"]] = cell_paint_columns

    if par["obsm_cell_paint_nuclear"]:
        logger.info(f"Adding {par['obsm_cell_paint_nuclear']} to obsm")
        adata.obsm[par["obsm_cell_paint_nuclear"]] = df[
            cell_paint_nuclear_columns
        ].copy()
        adata.uns[par["obsm_cell_paint_nuclear"]] = cell_paint_nuclear_columns

    if par["obsm_cell_profiler"]:
        logger.info(f"Adding {par['obsm_cell_profiler']} to obsm")
        adata.obsm[par["obsm_cell_profiler"]] = df[cell_profiler_columns].copy()
        adata.uns[par["obsm_cell_profiler"]] = cell_profiler_columns

    if par["obsm_unassigned_targets"]:
        logger.info(f"Adding {par['obsm_unassigned_targets']} to obsm")
        # Store under the user-supplied key, like the other optional groups,
        # instead of a hard-coded "unassigned_targets".
        adata.obsm[par["obsm_unassigned_targets"]] = df[unassigned_columns].copy()
        adata.uns[par["obsm_unassigned_targets"]] = unassigned_columns

    # Add (optional) nuclear count layer
    if par["layer_nuclear_counts"]:
        # The layer must share the column order of .X to be meaningful.
        assert_matching_order(
            var_names, nuclear_count_columns, split_pattern="_Nuclear"
        )
        logger.info(f"Adding {par['layer_nuclear_counts']} to layers")
        nuclear_count_df = df[nuclear_count_columns].copy()
        nuclear_count_matrix_sparse = sp.csr_matrix(nuclear_count_df.values)
        adata.layers[par["layer_nuclear_counts"]] = nuclear_count_matrix_sparse

    # Write output MuData
    logger.info("Writing MuData object...")
    mdata = mu.MuData({par["modality"]: adata})
    mdata.write_h5mu(par["output"], compression=par["output_compression"])


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,106 @@
import pytest
import sys
import mudata as mu
## VIASH START
# Defaults for running this test file directly from the repository root.
# `resources_dir` must be the parent of the `aviti` folder, because the input
# path below appends "aviti/..." to it.
meta = {
    "executable": "./target/executable/convert/from_cells2stats_to_h5mu/from_cells2stats_to_h5mu",
    "resources_dir": "resources_test",
}
## VIASH END

# Tiny cells2stats bundle used as input by every test in this file.
input = f"{meta['resources_dir']}/aviti/teton_cells2stats_tiny/"
def test_simple_execution(run_component, tmp_path):
    """Run the converter with default options and check the H5MU layout."""
    h5mu_path = tmp_path / "aviti.h5mu"

    # run component
    cli_args = [
        "--input",
        input,
        "--output",
        str(h5mu_path),
        "--output_compression",
        "gzip",
    ]
    run_component(cli_args)
    assert h5mu_path.is_file(), "output file was not created"

    mdata = mu.read_h5mu(h5mu_path)
    modalities = list(mdata.mod.keys())
    assert modalities == ["rna"], "Expected modality rna"

    adata = mdata.mod["rna"]
    assert adata.X.dtype.kind == "f"

    # Every obs column must be one of the known fixed metadata columns.
    expected_obs_keys = [
        "AreaUm",
        "Area",
        "Tile",
        "WellLabel",
        "Well",
        "Cell",
        "NuclearAreaUm",
        "NuclearArea",
    ]
    for obs_column in adata.obs.columns:
        assert obs_column in expected_obs_keys

    # dtype kinds: unsigned ints for counts, floats for areas in um,
    # objects for the categorical labels.
    for unsigned_column in ("Area", "Cell", "NuclearArea"):
        assert adata.obs[unsigned_column].dtype.kind == "u"
    for float_column in ("AreaUm", "NuclearAreaUm"):
        assert adata.obs[float_column].dtype.kind == "f"
    for object_column in ("Tile", "WellLabel", "Well"):
        assert adata.obs[object_column].dtype.kind == "O"

    # Only the two coordinate sets are expected without optional arguments.
    expected_obsm_keys = ["spatial", "spatial_um"]
    assert list(adata.obsm.keys()) == expected_obsm_keys
    assert list(adata.uns.keys()) == expected_obsm_keys
    for obsm_key in expected_obsm_keys:
        assert adata.obsm[obsm_key].dtype.kind == "f"
def test_extended_parameters(run_component, tmp_path):
    """Run the converter with every optional argument set and check the keys."""
    h5mu_path = tmp_path / "aviti_ext.h5mu"

    # Option/value pairs, flattened into the argument list in this order.
    options = [
        ("--input", input),
        ("--modality", "mod1"),
        ("--output", str(h5mu_path)),
        ("--layer_nuclear_counts", "nuclear_counts"),
        ("--obsm_coordinates", "coords"),
        ("--obsm_cell_paint", "cell_paint"),
        ("--obsm_cell_paint_nuclear", "cell_paint_nuclear"),
        ("--obsm_cell_profiler", "cell_profiler"),
        ("--obsm_unassigned_targets", "unassigned_targets"),
        ("--output_compression", "gzip"),
    ]
    cli_args = [token for option in options for token in option]

    # run component
    run_component(cli_args)
    assert h5mu_path.is_file(), "output file was not created"

    mdata = mu.read_h5mu(h5mu_path)
    assert list(mdata.mod.keys()) == ["mod1"]

    adata = mdata.mod["mod1"]
    assert list(adata.layers) == ["nuclear_counts"]
    assert adata.layers["nuclear_counts"].dtype.kind == "f"

    # The same keys are expected in .uns (column names) and .obsm (values).
    expected_obsm_keys = [
        "cell_paint",
        "cell_paint_nuclear",
        "cell_profiler",
        "coords",
        "coords_um",
        "unassigned_targets",
    ]
    assert list(adata.uns.keys()) == expected_obsm_keys
    assert list(adata.obsm.keys()) == expected_obsm_keys
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,61 @@
name: "from_cosmx_to_h5mu"
namespace: "convert"
description: |
Converts the output from a NanoString experiment into a MuData object.
- `<dataset_id>_exprMat_file.csv`: File containing the counts.
- `<dataset_id>_metadata_file.csv`: File containing the spatial coordinates and additional cell-level metadata.
- `<dataset_id>_fov_positions_file.csv`: File containing the coordinates of all the fields of view.
In addition to reading the regular Nanostring output, it loads CellComposite and CellLabels directories, if present,
containing the images.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Input folder. Must contain the output from a NanoString CosMx run.
example: cosmx_data
direction: input
required: true
- name: "--modality"
type: string
default: rna
- name: "--output"
alternatives: ["-o"]
type: file
description: The output h5mu file.
example: "output.h5mu"
direction: output
- name: "--output_compression"
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/cosmx/Lung5_Rep2_tiny/
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/squidpy.yaml]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, singlecpu]

View File

@@ -0,0 +1,43 @@
import sys
import os
import squidpy as sq
import mudata as mu
import glob
## VIASH START
par = {
"input": "./resources_test/cosmx/Lung5_Rep2_tiny",
"output": "./resources_test/cosmx/Lung5_Rep2_tiny.h5mu",
"modality": "rna",
"output_compression": None,
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
logger = setup_logger()
def find_matrix_file(suffix):
    """Return the one file in the input directory whose name ends in ``suffix``.

    Args:
        suffix: Trailing part of the file name to look for.

    Returns:
        The matching path as produced by ``glob`` (input directory included).

    Raises:
        AssertionError: If zero or more than one file matches.
    """
    search_pattern = os.path.join(par["input"], f"*{suffix}")
    matches = glob.glob(search_pattern)
    n_matches = len(matches)
    assert n_matches == 1, (
        f"Only one file matching pattern {search_pattern} should be present"
    )
    (only_match,) = matches
    return only_match
# Locate the three CosMx tables. squidpy resolves each file relative to `path`,
# so pass bare file names: handing it the full glob result would repeat a
# relative input directory in the resolved path.
counts_file = os.path.basename(find_matrix_file("exprMat_file.csv"))
fov_file = os.path.basename(find_matrix_file("fov_positions_file.csv"))
meta_file = os.path.basename(find_matrix_file("metadata_file.csv"))

logger.info("Reading in CosMx data...")
adata = sq.read.nanostring(
    path=par["input"], counts_file=counts_file, meta_file=meta_file, fov_file=fov_file
)

logger.info("Writing output MuData object...")
# Wrap the AnnData object as the single modality of the output MuData.
mdata = mu.MuData({par["modality"]: adata})
mdata.write_h5mu(par["output"], compression=par["output_compression"])

View File

@@ -0,0 +1,57 @@
import pytest
import sys
import mudata as mu
def test_simple_execution(run_component, tmp_path):
    """Convert the tiny CosMx dataset and check the structure of the H5MU file.

    Only arguments declared by the component are passed; `--dataset_id` and
    `--num_fovs` are not arguments of this converter.
    """
    output = tmp_path / "cosmx_tiny.h5mu"

    run_component(
        [
            "--input",
            meta["resources_dir"] + "/Lung5_Rep2_tiny",
            "--output",
            # command-line arguments are passed as plain strings
            str(output),
        ]
    )

    assert output.is_file(), "output file was not created"

    mdata = mu.read_h5mu(output)
    assert list(mdata.mod.keys()) == ["rna"], "Expected modality rna"

    adata = mdata.mod["rna"]
    assert list(adata.obs.keys()) == [
        "fov",
        "Area",
        "AspectRatio",
        "CenterX_global_px",
        "CenterY_global_px",
        "Width",
        "Height",
        "Mean.MembraneStain",
        "Max.MembraneStain",
        "Mean.PanCK",
        "Max.PanCK",
        "Mean.CD45",
        "Max.CD45",
        "Mean.CD3",
        "Max.CD3",
        "Mean.DAPI",
        "Max.DAPI",
        "cell_ID",
    ]
    assert list(adata.uns.keys()) == ["spatial"]
    assert list(adata.obsm.keys()) == ["spatial", "spatial_fov"]
    assert adata.obsm["spatial"].dtype == "int"
    assert adata.obsm["spatial_fov"].dtype == "float"
if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,82 @@
name: "from_cosmx_to_spatialexperiment"
namespace: "convert"
scope: "public"
description: |
Creates a SpatialExperiment object from the downloaded unzipped CosMx directory for Nanostring
CosMx spatial gene expression data, and saves it as a SpatialExperiment object.
The constructor assumes the downloaded unzipped CosMx Folder has the following structure:
Mandatory files
· | — *_exprMat_file.csv
· | — *_metadata_file.csv
Optional files, by default added to the metadata() as a list of paths (will be converted to parquet):
· | —*_fov_positions_file.csv
· | — *_tx_file.csv
· | — *_polygons.csv
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author, maintainer ]
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Input CosMx directory
direction: input
required: true
example: path/to/cosmx_bundle
- name: "--add_tx_path"
type: boolean
default: true
description: |
Whether to add the transcripts parquet path to the metadata.
If True, `*_tx_file.csv` file will be converted to .parquet and added to the metadata.
- name: "--add_polygon_path"
type: boolean
default: true
description: |
Whether to add polygon path to the metadata.
If True, `*_polygons.csv` file will be converted to .parquet and be added to the metadata.
- name: "--add_fov_positions"
type: boolean
default: true
description: |
Whether to add fov positions to the metadata.
If True, `*_fov_positions_file.csv` will be added to the metadata.
- name: "--alternative_experiment_features"
type: string
multiple: true
description: Feature names containing these strings will be moved to altExps(sxe) slots as separate SpatialExperiment objects.
default: [NegPrb, Negative, SystemControl, FalseCode]
- name: "--output"
alternatives: ["-o"]
type: file
description: Output SpatialExperiment file
direction: output
required: true
example: output.rds
resources:
- type: r_script
path: script.R
test_resources:
- type: r_script
path: test.R
- path: /resources_test/cosmx/Lung5_Rep2_tiny
engines:
- type: docker
image: rocker/r2u:24.04
setup:
- type: apt
packages:
- libhdf5-dev
- libgeos-dev
- type: r
bioc: [ SpatialExperimentIO ]
test_setup:
- type: r
cran: [ testthat ]
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, singlecpu]

View File

@@ -0,0 +1,36 @@
library(SpatialExperimentIO)
### VIASH START
par <- list(
input = "resources_test/cosmx/Lung5_Rep2_tiny",
add_tx_path = TRUE,
add_polygon_path = FALSE,
add_fov_positions = TRUE,
alternative_experiment_features = c(
"NegPrb", "Negative", "SystemControl", "FalseCode"
),
output = "spe_cosmx_test.rds"
)
### VIASH END
# Parquet paths are only skipped when neither the polygon file nor the
# transcript file has to be attached to the metadata.
no_parquet_requested <-
  par$add_polygon_path == FALSE && par$add_tx_path == FALSE
add_parquet_paths <- if (no_parquet_requested) FALSE else TRUE

# Fixed file patterns and coordinate columns passed to the CosMx reader.
counts_pattern <- "exprMat_file.csv"
metadata_pattern <- "metadata_file.csv"
fov_positions_pattern <- "fov_positions_file.csv"
coordinate_columns <- c("CenterX_global_px", "CenterY_global_px")

spe <- readCosmxSXE(
  dirName = par$input,
  returnType = "SPE",
  countMatPattern = counts_pattern,
  metaDataPattern = metadata_pattern,
  coordNames = coordinate_columns,
  addFovPos = par$add_fov_positions,
  fovPosPattern = fov_positions_pattern,
  addParquetPaths = add_parquet_paths,
  addPolygon = par$add_polygon_path,
  addTx = par$add_tx_path,
  altExps = par$alternative_experiment_features
)

# Serialise the object to the requested output path.
saveRDS(spe, file = par$output)

View File

@@ -0,0 +1,113 @@
library(testthat, warn.conflicts = FALSE)
library(SpatialExperimentIO)
library(SpatialExperiment)
## VIASH START
meta <- list(
executable = "./from_cosmx_to_spatialexperiment",
resources_dir = "resources_test/cosmx/",
name = "from_cosmx_to_spatialexperiment"
)
## VIASH END
cat("> Checking simple execution\n")

# Path to the tiny CosMx bundle inside the test resources directory.
spe <- paste0(
  meta[["resources_dir"]],
  "/Lung5_Rep2_tiny"
)
out_rds <- "output.rds"

cat("> Running ", meta[["name"]], "\n", sep = "")
out <- processx::run(
  meta[["executable"]],
  c(
    "--input", spe,
    "--add_tx_path", TRUE,
    "--add_polygon_path", FALSE,
    "--output", out_rds
  )
)

cat("> Checking whether output file exists\n")
expect_equal(out$status, 0)
expect_true(file.exists(out_rds))

cat("> Reading output file\n")
obj <- readRDS(file = out_rds)

cat("> Checking whether SpatialExperiment object is in the right format\n")
# Object type (expect_s4_class replaces the deprecated expect_is)
expect_s4_class(obj, "SpatialExperiment")
# Assay structure
expect_equal(names(slot(obj, "assays")), "counts")
# Spatial coordinates
expect_equal(
  spatialCoordsNames(obj),
  c("CenterX_global_px", "CenterY_global_px")
)
# Alternative experiments
expect_equal(altExpNames(obj), c("NegPrb"))
# Metadata components
expect_named(
  metadata(obj),
  c("fov_positions", "transcripts"),
  ignore.order = TRUE
)
# Parquet paths
expect_true(grepl("\\.parquet$", metadata(obj)[["transcripts"]]))
# Dimensions: compare against reading the same bundle directly
input <- readCosmxSXE(
  dirName = spe,
  addParquetPaths = FALSE,
  returnType = "SPE"
)
dim_rds <- dim(obj)
dim_input <- dim(input)
expect_equal(dim_rds, dim_input)
cat("> Checking parameter functionality\n")

out_rds_ext <- "output_ext.rds"

cat("> Running ", meta[["name"]], "\n", sep = "")
# Second run: all optional metadata switched off and a single
# alternative-experiment pattern.
out_ext <- processx::run(
  meta[["executable"]],
  c(
    "--input", spe,
    "--add_fov_positions", FALSE,
    "--add_tx_path", FALSE,
    "--add_polygon_path", FALSE,
    "--alternative_experiment_features", c("Negative"),
    "--output", out_rds_ext
  )
)

cat("> Checking whether output file exists\n")
expect_equal(out_ext$status, 0)
expect_true(file.exists(out_rds_ext))

cat("> Reading output file\n")
obj_ext <- readRDS(file = out_rds_ext)

cat("> Checking whether SpatialExperiment object is in the right format\n")
# Object type (expect_s4_class replaces the deprecated expect_is)
expect_s4_class(obj_ext, "SpatialExperiment")
# Assay structure
expect_equal(names(slot(obj_ext, "assays")), "counts")
# Spatial coordinates
expect_equal(
  spatialCoordsNames(obj_ext),
  c("CenterX_global_px", "CenterY_global_px")
)
# Alternative experiments
expect_length(altExpNames(obj_ext), 0)
# Metadata components
expect_length(metadata(obj_ext), 0)

# Same number of columns as the first read, but a different number of rows.
dim_rds_ext <- dim(obj_ext)
expect_true(identical(dim_rds_ext[2], dim_input[2]))
expect_false(identical(dim_rds_ext[1], dim_input[1]))

View File

@@ -0,0 +1,55 @@
name: "from_spatialdata_to_h5mu"
namespace: "convert"
description: |
Reads in the Tables field stored in a SpatialData object and converts it to an h5mu file.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Input zarr folder where the SpatialData object is stored.
example: input.zarr
direction: input
required: true
- name: "--modality"
type: string
default: rna
- name: "--output"
alternatives: ["-o"]
type: file
description: The output h5mu file.
example: "output.h5mu"
direction: output
- name: "--output_compression"
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/xenium/xenium_tiny.zarr
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/spatialdata.yaml]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, singlecpu]

View File

@@ -0,0 +1,28 @@
import sys
import spatialdata as sd
import mudata as mu
## VIASH START
par = {
"input": "./resources_test/xenium/xenium_tiny.zarr",
"output": "./resources_test/xenium/xenium_tiny.h5mu",
"modality": "rna",
"output_compression": None,
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
logger = setup_logger()
logger.info("Reading in SpatialData object...")
sdata = sd.read_zarr(par["input"])

logger.info("Fetching AnnData table from SpatialData object...")
# Only the table stored under the key "table" is converted. Fail with the list
# of available tables instead of a bare KeyError when it is absent.
if "table" not in sdata.tables:
    raise KeyError(
        f"No table named 'table' found in {par['input']}. "
        f"Available tables: {list(sdata.tables.keys())}"
    )
adata = sdata.tables["table"]

logger.info("Writing output MuData object...")
# The table becomes the single modality of the output MuData.
mdata = mu.MuData({par["modality"]: adata})
mdata.write_h5mu(par["output"], compression=par["output_compression"])

View File

@@ -0,0 +1,52 @@
import pytest
import sys
import mudata as mu
def test_simple_execution(run_component, tmp_path):
    """Convert the tiny SpatialData store and verify the layout of the h5mu."""
    h5mu_path = tmp_path / "output.h5mu"
    cli_args = [
        "--input",
        meta["resources_dir"] + "/xenium_tiny.zarr",
        "--output",
        h5mu_path,
    ]
    run_component(cli_args)
    assert h5mu_path.is_file(), "output file was not created"

    mdata = mu.read_h5mu(h5mu_path)
    assert list(mdata.mod.keys()) == ["rna"], "Expected modality rna"
    rna = mdata.mod["rna"]

    # TODO: update what is checked here when spatialdata from other experimental set-ups are tested (e.g. cosmx, visium)
    expected_obs_columns = [
        "cell_id",
        "transcript_counts",
        "control_probe_counts",
        "genomic_control_counts",
        "control_codeword_counts",
        "unassigned_codeword_counts",
        "deprecated_codeword_counts",
        "total_counts",
        "cell_area",
        "nucleus_area",
        "nucleus_count",
        "segmentation_method",
        "region",
        "z_level",
        "cell_labels",
    ]
    assert list(rna.obs.keys()) == expected_obs_columns
    assert list(rna.uns.keys()) == ["spatialdata_attrs"]
    assert list(rna.obsm.keys()) == ["spatial"]
    assert list(rna.var.keys()) == ["gene_ids", "feature_types", "genome"]
    assert all(rna.var["feature_types"] == "Gene Expression")
    assert rna.obsm["spatial"].dtype == "float"
# Allow running this file directly; the process exit code is pytest's result.
if __name__ == "__main__":
    sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,69 @@
name: "from_xenium_to_h5mu"
namespace: "convert"
scope: "public"
description: |
Converts the output from Xenium to a single .h5mu file, where the count matrix is written to the `rna` modality.
The following files are expected to be present in the Xenium output bundle:
├── cell_feature_matrix.h5
├── cells.parquet
├── experiment.xenium
└── metrics_summary.csv
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer ]
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Input folder. Must contain the output from a Xenium run.
example: xenium_output_bundle
direction: input
required: true
- name: "--output"
alternatives: ["-o"]
type: file
description: Output .h5mu file.
example: "xenium.h5mu"
direction: output
- name: "--obsm_coordinates"
type: string
description: Name of the .obsm slot under which to store the cell centroid coordinates.
default: "spatial"
- name: "--uns_experiment"
type: string
description: Name of the .uns slot under which to store the Xenium experiment specifications.
default: "xenium_experiment"
- name: "--uns_metrics"
type: string
description: Name of the .uns slot under which to store the summary QC metrics.
default: "xenium_metrics"
__merge__: [., /src/base/h5_compression_argument.yaml]
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/xenium/xenium_tiny
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
packages:
- pyarrow
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, singlecpu]

View File

@@ -0,0 +1,75 @@
import sys
from pathlib import Path
import scanpy as sc
import pandas as pd
import mudata as mu
import json
## VIASH START
par = {
"input": "./resources_test/xenium/xenium_tiny",
"output": "xenium_tiny_test.h5mu",
"output_compression": "gzip",
"obsm_coordinates": "spatial",
"uns_experiment": "xenium_experiment",
"uns_metrics": "xenium_metrics",
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
logger = setup_logger()
# Expected folder structure (showing only relevant files):
# ├── cell_feature_matrix.h5
# ├── cells.parquet
# ├── experiment.xenium
# └── metrics_summary.csv
# Resolve every required bundle file against the input directory.
input_dir = Path(par["input"])
input_data = dict(
    count_matrix=input_dir / "cell_feature_matrix.h5",
    cells_metadata=input_dir / "cells.parquet",
    experiment=input_dir / "experiment.xenium",
    metrics_summary=input_dir / "metrics_summary.csv",
)
def _format_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
"""Convert cell IDs to string format, decoding bytes if necessary."""
return cell_id_column.apply(
lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x)
)
# Read data from Xenium output bundle
logger.info("Reading input data...")
# Name the files that are actually missing instead of printing the raw
# dict_values object of all expected paths.
missing_files = [str(file) for file in input_data.values() if not file.exists()]
expected_files = ", ".join(file.name for file in input_data.values())
assert not missing_files, (
    f"Not all required input files are found. Make sure that {par['input']} "
    f"contains {expected_files}. Missing: {', '.join(missing_files)}."
)

adata = sc.read_10x_h5(input_data["count_matrix"])
metadata = pd.read_parquet(input_data["cells_metadata"], engine="pyarrow")
with open(input_data["experiment"], "r") as f:
    specs = json.load(f)
metrics_summary = pd.read_csv(
    input_data["metrics_summary"], decimal=".", quotechar='"', thousands=","
)

# Extract and format required columns
cell_ids = _format_cell_id_column(metadata["cell_id"])
coordinates = metadata[["x_centroid", "y_centroid"]].to_numpy()
# The ID and centroid columns are stored as obs_names / obsm below, so they
# are removed from the per-cell metadata table.
metadata.drop(["cell_id", "x_centroid", "y_centroid"], axis=1, inplace=True)

# Update AnnData with metadata
adata.obs = metadata
adata.obs_names = cell_ids
adata.obsm[par["obsm_coordinates"]] = coordinates
adata.uns[par["uns_experiment"]] = specs
adata.uns[par["uns_metrics"]] = metrics_summary

# Write output MuData
mdata = mu.MuData({"rna": adata})
mdata.write_h5mu(par["output"], compression=par["output_compression"])

View File

@@ -0,0 +1,97 @@
import pytest
import sys
import mudata as mu
## VIASH START
# Placeholder values for running this test file directly. They point at this
# component (from_xenium_to_h5mu) and at the directory holding `xenium_tiny`.
meta = {
    "executable": "./target/executable/convert/from_xenium_to_h5mu/from_xenium_to_h5mu",
    "resources_dir": "resources_test/xenium",
    "config": "src/convert/from_xenium_to_h5mu/config.vsh.yaml",
}
## VIASH END

# Xenium output bundle used as input by every test in this file.
input = f"{meta['resources_dir']}/xenium_tiny"
def test_simple_execution(run_component, tmp_path):
    """Convert the tiny Xenium bundle with default slot names and check the h5mu."""
    h5mu_path = tmp_path / "xenium.h5mu"

    # run component
    cli_args = [
        "--input",
        input,
        "--output",
        str(h5mu_path),
        "--output_compression",
        "gzip",
    ]
    run_component(cli_args)
    assert h5mu_path.is_file(), "output file was not created"

    mdata = mu.read_h5mu(h5mu_path)
    assert list(mdata.mod.keys()) == ["rna"], "Expected modality rna"
    rna = mdata.mod["rna"]

    expected_obs_columns = [
        "transcript_counts",
        "control_probe_counts",
        "genomic_control_counts",
        "control_codeword_counts",
        "unassigned_codeword_counts",
        "deprecated_codeword_counts",
        "total_counts",
        "cell_area",
        "nucleus_area",
        "nucleus_count",
        "segmentation_method",
    ]
    assert list(rna.obs.keys()) == expected_obs_columns
    assert list(rna.uns.keys()) == ["xenium_experiment", "xenium_metrics"]
    assert list(rna.obsm.keys()) == ["spatial"]
    assert list(rna.var.keys()) == ["gene_ids", "feature_types", "genome"]

    assert rna.X.dtype.kind == "f"
    assert all(rna.var["feature_types"] == "Gene Expression")
    assert rna.obsm["spatial"].dtype == "float"

    # Count-like columns must be integer typed, area columns floating point.
    integer_columns = [
        "transcript_counts",
        "control_probe_counts",
        "genomic_control_counts",
        "unassigned_codeword_counts",
        "deprecated_codeword_counts",
        "total_counts",
        "nucleus_count",
    ]
    float_columns = ["cell_area", "nucleus_area"]
    assert all(rna.obs[column].dtype == "int" for column in integer_columns)
    assert all(rna.obs[column].dtype == "float" for column in float_columns)
def test_rename_fields(run_component, tmp_path):
    """Check that custom .obsm / .uns slot names end up in the written h5mu."""
    h5mu_path = tmp_path / "xenium.h5mu"

    # run component
    cli_args = ["--input", input, "--output", str(h5mu_path)]
    cli_args += ["--obsm_coordinates", "test_coord"]
    cli_args += ["--uns_experiment", "test_experiment"]
    cli_args += ["--uns_metrics", "test_metrics"]
    cli_args += ["--output_compression", "gzip"]
    run_component(cli_args)
    assert h5mu_path.is_file(), "output file was not created"

    mdata = mu.read_h5mu(h5mu_path)
    assert list(mdata.mod.keys()) == ["rna"]
    rna = mdata.mod["rna"]

    assert list(rna.uns.keys()) == ["test_experiment", "test_metrics"]
    assert list(rna.obsm.keys()) == ["test_coord"]
# Allow running this file directly; the process exit code is pytest's result.
if __name__ == "__main__":
    sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,99 @@
name: "from_xenium_to_spatialdata"
namespace: "convert"
description: |
Converts the output of a 10x Genomics Xenium run into a SpatialData object.
By default, the following files will be converted:
- `experiment.xenium`: File containing specifications.
- `nucleus_boundaries.parquet`: Polygons of nucleus boundaries.
- `cell_boundaries.parquet`: Polygons of cell boundaries.
- `transcripts.parquet`: File containing transcripts.
- `cell_feature_matrix.h5`: File containing cell feature matrix.
- `cells.parquet`: File containing cell metadata.
- `morphology_mip.ome.tif`: File containing morphology mip.
- `morphology_focus.ome.tif`: File containing morphology focus.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Input folder. Must contain the output from a xenium run.
example: xenium_data
direction: input
required: true
- name: "--output"
alternatives: ["-o"]
type: file
description: Zarr directory where the SpatialData object will be stored
example: "xenium_data.zarr"
direction: output
- name: "--cells_boundaries"
type: boolean
default: True
description: Whether to read cell boundaries (polygons).
- name: "--nucleus_boundaries"
type: boolean
default: True
description: Whether to read nucleus boundaries (polygons).
- name: "--cells_as_circles"
type: boolean_true
description: Whether to read cells also as circles (the center and the radius of each circle is computed from the corresponding labels cell).
- name: "--cells_labels"
type: boolean
default: True
description: Whether to read cell labels (raster). The polygonal version of the cell labels are simplified for visualization purposes, and using the raster version is recommended for analysis.
- name: "--transcripts"
type: boolean
default: True
description: Whether to read transcripts.
- name: "--nucleus_labels"
type: boolean
default: True
description: Whether to read nucleus labels (raster). The polygonal version of the nucleus labels are simplified for visualization purposes, and using the raster version is recommended for analysis.
- name: "--morphology_mip"
type: boolean
default: True
description: Whether to read the morphology mip image (available in versions < 2.0.0).
- name: "--morphology_focus"
type: boolean
default: True
description: Whether to read the morphology focus image.
- name: "--aligned_images"
type: boolean
default: True
description: Whether to also parse, when available, additional H&E or IF aligned images. For more control over the aligned images being read, in particular, to specify the axes of the aligned images, please set this parameter to False and use the xenium_aligned_image function directly.
- name: "--cells_table"
type: boolean
default: True
description: Whether to read the cell annotations in the AnnData table.
- name: "--n_jobs"
type: integer
default: 1
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/xenium/xenium_tiny/
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [ /src/base/requirements/spatialdata-io.yaml ]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, singlecpu]

View File

@@ -0,0 +1,46 @@
import sys
from spatialdata_io import xenium
## VIASH START
# Placeholder values for running this script directly. The input points at
# the same tiny Xenium bundle that the component declares as test resource.
par = {
    "input": "./resources_test/xenium/xenium_tiny",
    "output": "./test/xenium_tiny.zarr",
    "cells_boundaries": True,
    "nucleus_boundaries": True,
    "cells_as_circles": None,
    "cells_labels": True,
    "nucleus_labels": True,
    "transcripts": True,
    "morphology_mip": True,
    "morphology_focus": True,
    "aligned_images": True,
    "cells_table": True,
    "n_jobs": 1,
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
logger = setup_logger()
logger.info("Reading in Xenium data...")
# Every reader option is forwarded under the same name as the component argument.
reader_options = {
    option: par[option]
    for option in (
        "cells_boundaries",
        "nucleus_boundaries",
        "cells_as_circles",
        "cells_labels",
        "nucleus_labels",
        "transcripts",
        "morphology_mip",  # only available in version < 2.0.0
        "morphology_focus",
        "aligned_images",
        "cells_table",
        "n_jobs",
    )
}
sdata = xenium(par["input"], **reader_options)

logger.info("Writing out SpatialData object to Zarr...")
# An existing store at the output location is overwritten.
sdata.write(par["output"], overwrite=True)

View File

@@ -0,0 +1,35 @@
import pytest
import os
import sys
import spatialdata as sd
def test_simple_execution(run_component, tmp_path):
    """Convert the tiny Xenium bundle and check the layout of the Zarr store."""
    output_sd_path = tmp_path / "sd"

    run_component(
        [
            "--input",
            meta["resources_dir"] + "/xenium_tiny",
            "--output",
            output_sd_path,
        ]
    )
    assert os.path.exists(output_sd_path), "Output zarr folder was not created"

    sdata = sd.read_zarr(output_sd_path)
    assert isinstance(sdata, sd.SpatialData), (
        "the generated output is not a SpatialData object"
    )

    # One sub-folder per element type. The message names the folder being
    # checked, so a missing "points" folder is no longer reported as "images".
    for element_dir in ["images", "labels", "points", "shapes", "tables"]:
        assert os.path.exists(output_sd_path / element_dir), (
            f"{element_dir} folder was not created"
        )
    assert (output_sd_path / "zmetadata").is_file(), "zmetadata file was not created"
# Allow running this file directly; the process exit code is pytest's result.
if __name__ == "__main__":
    sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,75 @@
name: "from_xenium_to_spatialexperiment"
namespace: "convert"
scope: "public"
description: |
Creates a SpatialExperiment object from the downloaded unzipped Xenium Output Bundle directory
for 10x Genomics Xenium spatial gene expression data, and saves it as a SpatialExperiment object.
The constructor assumes the downloaded unzipped Xenium Output Bundle has the following structure:
Mandatory files
· | — cell_feature_matrix.h5
· | — cells.parquet
Optional files, by default added to the metadata() as a list of paths (will be converted to parquet):
· | — transcripts.parquet
· | — cell_boundaries.parquet
· | — nucleus_boundaries.parquet
· | — experiment.xenium
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author, maintainer ]
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Input Xenium Output Bundle
direction: input
required: true
example: path/to/xenium_bundle
- name: "--add_experiment_xenium"
type: boolean
default: true
description: Whether to add the parameters from the `experiment.xenium` file to the metadata.
- name: "--add_parquet_paths"
type: boolean
default: true
description: |
Whether to add parquet paths to the metadata.
If True, `transcripts.parquet`, `cell_boundaries.parquet`, `nucleus_boundaries.parquet` will be added to the metadata.
- name: "--alternative_experiment_features"
type: string
multiple: true
description: Feature names containing these strings will be moved to altExps(sxe) slots as separate SpatialExperiment objects.
default: [NegControlProbe, UnassignedCodeword, NegControlCodeword, antisense, BLANK]
- name: "--output"
alternatives: ["-o"]
type: file
description: Output SpatialExperiment file
direction: output
required: true
example: output.rds
resources:
- type: r_script
path: script.R
test_resources:
- type: r_script
path: test.R
- path: /resources_test/xenium/xenium_tiny
engines:
- type: docker
image: rocker/r2u:24.04
setup:
- type: apt
packages:
- libhdf5-dev
- libgeos-dev
- type: r
bioc: [ SpatialExperimentIO ]
test_setup:
- type: r
cran: [ testthat ]
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, singlecpu]

View File

@@ -0,0 +1,28 @@
library(SpatialExperimentIO)
### VIASH START
par <- list(
input = "resources_test/xenium/xenium_tiny",
add_experiment_xenium = TRUE,
add_parquet_paths = TRUE,
alternative_experiment_features = c(
"NegControlProbe", "UnassignedCodeword",
"NegControlCodeword", "antisense", "BLANK"
),
output = "spe_test.rds"
)
### VIASH END
# Fixed file patterns and coordinate columns passed to the Xenium reader.
counts_pattern <- "cell_feature_matrix.h5"
metadata_pattern <- "cells.parquet"
coordinate_columns <- c("x_centroid", "y_centroid")

spe <- readXeniumSXE(
  dirName = par$input,
  returnType = "SPE",
  countMatPattern = counts_pattern,
  metaDataPattern = metadata_pattern,
  coordNames = coordinate_columns,
  addExperimentXenium = par$add_experiment_xenium,
  addParquetPaths = par$add_parquet_paths,
  altExps = par$alternative_experiment_features
)

# Serialise the object to the requested output path.
saveRDS(spe, file = par$output)

View File

@@ -0,0 +1,111 @@
library(testthat, warn.conflicts = FALSE)
library(SpatialExperimentIO)
library(SpatialExperiment)
## VIASH START
# Placeholder values for running this test file directly; `name` matches the
# component name declared in the config.
meta <- list(
  executable = "./from_xenium_to_spatialexperiment",
  resources_dir = "resources_test/xenium",
  name = "from_xenium_to_spatialexperiment"
)
## VIASH END
cat("> Checking simple execution\n")

# Path to the tiny Xenium bundle inside the test resources directory.
spe <- paste0(
  meta[["resources_dir"]],
  "/xenium_tiny"
)
out_rds <- "output.rds"

cat("> Running ", meta[["name"]], "\n", sep = "")
out <- processx::run(
  meta[["executable"]],
  c(
    "--input", spe,
    "--output", out_rds
  )
)

cat("> Checking whether output file exists\n")
expect_equal(out$status, 0)
expect_true(file.exists(out_rds))

cat("> Reading output file\n")
obj <- readRDS(file = out_rds)

cat("> Checking whether SpatialExperiment object is in the right format\n")
# Object type (expect_s4_class replaces the deprecated expect_is)
expect_s4_class(obj, "SpatialExperiment")
# Assay structure
expect_equal(names(slot(obj, "assays")), "counts")
# Spatial coordinates
expect_equal(spatialCoordsNames(obj), c("x_centroid", "y_centroid"))
# Alternative experiments
expect_equal(
  altExpNames(obj),
  c("NegControlProbe", "UnassignedCodeword", "NegControlCodeword")
)
# Metadata components
metadata_components <- c(
  "experiment.xenium", "transcripts", "cell_boundaries", "nucleus_boundaries"
)
expect_named(
  metadata(obj),
  metadata_components,
  ignore.order = TRUE
)
# Parquet paths
parquet_components <- c("transcripts", "cell_boundaries", "nucleus_boundaries")
for (component in parquet_components) {
  expect_true(grepl("\\.parquet$", metadata(obj)[[component]]))
}
# Dimensions: compare against reading the same bundle directly
input <- readXeniumSXE(
  dirName = spe,
  returnType = "SPE"
)
dim_rds <- dim(obj)
dim_input <- dim(input)
expect_equal(dim_rds, dim_input)
cat("> Checking parameter functionality\n")

out_rds_ext <- "output_ext.rds"

cat("> Running ", meta[["name"]], "\n", sep = "")
# Second run: optional metadata switched off and a single
# alternative-experiment pattern.
out_ext <- processx::run(
  meta[["executable"]],
  c(
    "--input", spe,
    "--add_experiment_xenium", FALSE,
    "--add_parquet_paths", FALSE,
    "--alternative_experiment_features", c("NegControlProbe"),
    "--output", out_rds_ext
  )
)

cat("> Checking whether output file exists\n")
expect_equal(out_ext$status, 0)
expect_true(file.exists(out_rds_ext))

cat("> Reading output file\n")
obj_ext <- readRDS(file = out_rds_ext)

cat("> Checking whether SpatialExperiment object is in the right format\n")
# Object type (expect_s4_class replaces the deprecated expect_is)
expect_s4_class(obj_ext, "SpatialExperiment")
# Assay structure
expect_equal(names(slot(obj_ext, "assays")), "counts")
# Spatial coordinates
expect_equal(spatialCoordsNames(obj_ext), c("x_centroid", "y_centroid"))
# Alternative experiments
expect_equal(altExpNames(obj_ext), c("NegControlProbe"))
# Metadata components
expect_length(metadata(obj_ext), 0)

# Same number of columns as the first read, but a different number of rows.
dim_rds_ext <- dim(obj_ext)
expect_true(identical(dim_rds_ext[2], dim_input[2]))
expect_false(identical(dim_rds_ext[1], dim_input[1]))

View File

@@ -0,0 +1,71 @@
name: "subset_cosmx"
namespace: "filter"
description: |
Filters the output of a NanoString CosMx experiment to keep only a subset of the fields of view.
Expected input folder structure:
path/to/dataset/
├── CellComposite/
├── CellLabels/
├── CellOverlay/
├── CompartmentLabels/
├── <dataset_id>_exprMat_file.csv
├── <dataset_id>_fov_positions_file.csv
├── <dataset_id>_metadata_file.csv
└── <dataset_id>_tx_file.csv
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Input folder. Must contain the output from a NanoString CosMx run.
example: cosmx_data
direction: input
required: true
- name: "--num_fovs"
type: integer
required: true
description: Number of fields of view to keep. Only the first <num_fovs> fields of view are kept.
- name: "--subset_transcripts_file"
type: boolean
default: true
description: Whether to subset the <dataset_id>_tx_file.csv file.
- name: "--subset_polygons_file"
type: boolean
default: true
description: Whether to subset the <dataset_id>_polygons.csv file.
- name: "--output"
alternatives: ["-o"]
type: file
description: The directory where the subset data will be stored.
example: "cosmx_data_tiny"
direction: output
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/cosmx/Lung5_Rep2_tiny/
engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: [ /src/base/requirements/squidpy.yaml ]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [lowmem, singlecpu]

View File

@@ -0,0 +1,69 @@
import os
import shutil
import pandas as pd
import glob
import sys
## VIASH START
par = {
"input": "./resources_test/cosmx/Lung5_Rep2",
"output": "./resources_test/cosmx/Lung5_Rep2_tiny/",
"subset_transcripts_file": True,
"subset_polygons_file": False,
"num_fovs": 5,
}
meta = {"resources_dir": "src/utils"}
## VIASH END
sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
logger = setup_logger()
def find_matrix_file(suffix):
    """Locate the single file in the input directory whose name ends in `suffix`.

    Args:
        suffix: Trailing part of the file name to match, e.g. "exprMat_file.csv".

    Returns:
        The path of the only matching file, as returned by `glob.glob`.

    Raises:
        AssertionError: If zero or more than one file matches the pattern.
    """
    pattern = os.path.join(par["input"], f"*{suffix}")
    files = glob.glob(pattern)
    # Report what was actually found: "no match" and "several matches" need
    # different fixes and the pattern alone does not tell them apart.
    assert len(files) == 1, (
        f"Only one file matching pattern {pattern} should be present, "
        f"found {len(files)}: {files}"
    )
    return files[0]
# Fields of view are selected by number: 1 up to and including `num_fovs`.
kept_fovs = list(range(1, par["num_fovs"] + 1))
os.makedirs(par["output"], exist_ok=True)

# Images
image_dirs = ["CellComposite", "CellLabels", "CellOverlay", "CompartmentLabels"]
for image_dir in image_dirs:
    logger.info(f"Subsetting {image_dir}, keeping fovs {kept_fovs}")
    output_image_dir = os.path.join(par["output"], image_dir)
    os.makedirs(output_image_dir, exist_ok=True)
    for fov in kept_fovs:
        # Image files are matched as <image_dir>_F<zero-padded fov>.<any extension>
        fov_pattern = os.path.join(
            par["input"], image_dir, f"{image_dir}_F{fov:03d}.*"
        )
        file_path = glob.glob(fov_pattern)
        # Say which image is missing or duplicated instead of failing silently.
        assert len(file_path) == 1, (
            f"Expected exactly one file matching {fov_pattern}, "
            f"found {len(file_path)}: {file_path}"
        )
        shutil.copy2(file_path[0], output_image_dir)
# Matrices
# The three flat files are always subset; transcripts and polygons are opt-in.
flat_file_suffixes = [
    "exprMat_file.csv",
    "fov_positions_file.csv",
    "metadata_file.csv",
]
if par["subset_transcripts_file"]:
    flat_file_suffixes.append("tx_file.csv")
if par["subset_polygons_file"]:
    flat_file_suffixes.append("polygons.csv")
matrices = [find_matrix_file(suffix) for suffix in flat_file_suffixes]

for matrix_path in matrices:
    logger.info(f"Subsetting {matrix_path}, keeping fovs {kept_fovs}")
    table = pd.read_csv(matrix_path)
    # Keep only the rows whose "fov" value is one of the retained fields of view.
    in_kept_fovs = table["fov"].isin(kept_fovs)
    destination = os.path.join(par["output"], os.path.basename(matrix_path))
    table[in_kept_fovs].to_csv(destination, index=False)

View File

@@ -0,0 +1,48 @@
import os
import sys
import pytest
import pandas as pd
def test_simple_execution(run_component, tmp_path):
    """Subset the tiny CosMx bundle to two fields of view and check the output."""
    output_path = tmp_path / "output"
    dataset_id = "Lung5_Rep2"

    run_component(
        [
            "--input",
            meta["resources_dir"] + "/Lung5_Rep2_tiny",
            "--subset_transcripts_file",
            "True",
            "--subset_polygons_file",
            "False",
            "--num_fovs",
            "2",
            "--output",
            output_path,
        ]
    )
    assert os.path.exists(output_path), "Output folder was not created"

    counts_file = output_path / f"{dataset_id}_exprMat_file.csv"
    fov_file = output_path / f"{dataset_id}_fov_positions_file.csv"
    meta_file = output_path / f"{dataset_id}_metadata_file.csv"
    tx_file = output_path / f"{dataset_id}_tx_file.csv"
    matrices = [counts_file, fov_file, meta_file, tx_file]
    images = ["CellComposite", "CellLabels", "CellOverlay", "CompartmentLabels"]

    for image in images:
        assert os.path.exists(output_path / image), f"{image} folder was not created"
        assert len(os.listdir(output_path / image)) == 2, (
            f"{image} folder should contain 2 files"
        )

    for matrix in matrices:
        assert os.path.exists(matrix), f"{matrix} file was not created"
        data = pd.read_csv(matrix)
        # Must be an `assert`: a bare comparison is evaluated and discarded,
        # so the fov count would never be checked.
        assert data["fov"].value_counts().shape[0] == 2, (
            f"{matrix} should contain 2 fovs"
        )
# Allow running this file directly; the process exit code is pytest's result.
if __name__ == "__main__":
    sys.exit(pytest.main([__file__]))

View File

@@ -0,0 +1,208 @@
name: spaceranger_count
namespace: mapping
description: Count gene expression and protein expression reads from a single capture area.
keywords: [spaceranger]
links:
documentation: https://www.10xgenomics.com/support/software/space-ranger/latest/analysis/running-pipelines/space-ranger-count
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
argument_groups:
- name: Inputs
arguments:
- type: file
name: --gex_reference
required: true
description: Path of folder containing 10x-compatible reference
example: "/path/to/refdata-gex-GRCh38-2020-A"
- type: file
name: --input
required: true
description: |
Path to a directory containing input FASTQ data. Individual FASTQ files should follow the naming convention of 10x Genomics:
[Sample Name]_S[Sample Number]_L[Lane Number]_[Read Type]_001.fastq.gz
Where:
[Sample Name] is the name assigned during sample preparation/sequencing
S[Sample Number] is the sample index (usually S1, S2, etc.)
L[Lane Number] identifies the sequencing lane (L001, L002, etc.)
[Read Type] will be one of:
R1 - Read 1 (contains the spatial barcode and UMI)
R2 - Read 2 (contains the actual cDNA sequence)
I1 - Index Read 1 (if applicable)
I2 - Index Read 2 (if applicable)
example: "/path/to/fastq_folder"
- type: file
name: --probe_set
required: true
description: CSV file specifying the probe set used
example: "Visium_Human_Transcriptome_Probe_Set_v2.0_GRCh38-2020-A.csv"
- type: file
name: --cytaimage
required: false
description: |
Brightfield image generated by the CytAssist instrument.
When using CytAssist workflow, either this or --image must be provided.
example: "cyta_image.tif"
- type: file
name: --image
required: false
description: |
H&E or fluorescence microscope image in TIFF or JPG format.
Required for standard Visium workflow, optional when using --cytaimage for CytAssist workflow.
example: "brightfield.tif"
- name: Outputs
arguments:
- type: file
name: --output
required: true
direction: output
description: The folder to store the alignment results
example: "/path/to/output"
- name: Slide Information
arguments:
- type: string
name: --slide
description: Visium slide serial number (e.g., 'V10J25-015')
required: false
example: "V10J25-015"
- type: string
name: --area
description: Visium capture area identifier (e.g., 'A1')
required: false
example: "A1"
- type: string
name: --unknown_slide
description: |
Use this option if the slide serial number and area were entered incorrectly on the CytAssist
instrument and the correct values are unknown. Not compatible with --slide, --area, or
--slidefile options.
required: false
choices: [visium-1, visium-2, visium-2-large, visium-hd]
- type: file
name: --slidefile
description: Slide design file for offline use
required: false
example: "slide_design.gpr"
- type: boolean_true
name: --override_id
description: Overrides the slide serial number and capture area provided in the Cytassist image metadata
- name: Image Options
arguments:
- type: file
name: --darkimage
description: Multi-channel, dark-background fluorescence image
required: false
example: "fluorescence.tif"
- type: file
name: --colorizedimage
description: Color image representing pre-colored dark-background fluorescence images
required: false
example: "colored_fluorescence.tif"
- type: integer
name: --dapi_index
description: Index of DAPI channel (1-indexed) of fluorescence image
required: false
example: 1
min: 1
- type: double
name: --image_scale
description: Microns per microscope image pixel
required: false
example: 0.65
min: 0.01
max: 10
- type: boolean
name: --reorient_images
default: true
description: Whether to rotate and mirror image to align fiducial pattern
- name: Processing Options
arguments:
- type: boolean
name: --create_bam
required: true
description: Enable or disable BAM file generation
default: true
- type: boolean_true
name: --nosecondary
description: Disable secondary analysis (e.g., clustering)
- type: integer
name: --r1_length
required: false
description: Hard trim the input Read 1 to this length before analysis
min: 1
- type: integer
name: --r2_length
required: false
description: Hard trim the input Read 2 to this length before analysis
min: 1
- type: boolean
name: --filter_probes
default: true
description: Whether to filter the probe set using the "included" column
- type: integer
name: --custom_bin_size
description: Bin Visium HD data to specified size in microns (4-100, even values only) in addition to the standard binning size (2 µm, 8 µm, 16 µm)
min: 4
max: 100
- name: Input Selection
arguments:
- type: string
name: --project
required: false
description: Project folder name within mkfastq output
- type: string
name: --sample
required: false
description: Prefix of FASTQ filenames to select
- type: integer
name: --lanes
multiple: true
required: false
description: Only use FASTQs from selected lanes
example: [1,2,3]
resources:
- type: bash_script
path: script.sh
test_resources:
- type: bash_script
path: test.sh
- path: /resources_test/visium
- path: /resources_test/GRCh38
engines:
- type: docker
image: ghcr.io/data-intuitive/spaceranger:3.1
setup:
- type: docker
run: |
DEBIAN_FRONTEND=noninteractive apt update && \
apt upgrade -y && apt install -y procps && rm -rf /var/lib/apt/lists/*
runners:
- type: executable
- type: nextflow

View File

@@ -0,0 +1,45 @@
#!/bin/bash

set -eo pipefail

# Wrapper around `spaceranger count`.
# Every `par_*` / `meta_*` variable is translated into the matching
# spaceranger flag; empty or unset variables produce no flag at all. After the
# run, the content of `<output>/outs` is moved one level up into `<output>`.

# The `${var:+...}` expansions below only test for non-emptiness, so flag-type
# arguments holding the string "false" must be unset to keep them off the
# command line.
unset_if_false=(
  par_override_id
  par_nosecondary
)

for par in "${unset_if_false[@]}"; do
  test_val="${!par}"
  if [[ "$test_val" == "false" ]]; then
    unset "$par"
  fi
done

# `--lanes` accepts multiple values. spaceranger expects a comma-separated
# list, so any ';' separators in the incoming value are converted to ','.
# NOTE(review): assumes viash joins multiple values with ';' (its default
# `multiple_sep`) - confirm.
par_lanes="${par_lanes//;/,}"

# Keep 2 GB aside for everything that is not spaceranger itself, but never
# request less than 1 GB (a value of 0 or below is not a usable memory limit).
localmem=""
if [[ -n "$meta_memory_gb" ]]; then
  localmem=$(( meta_memory_gb - 2 ))
  if (( localmem < 1 )); then
    localmem=1
  fi
fi

spaceranger count \
  ${par_output:+--id="$par_output"} \
  ${par_gex_reference:+--transcriptome="$par_gex_reference"} \
  ${par_input:+--fastqs="$par_input"} \
  ${par_probe_set:+--probe-set="$par_probe_set"} \
  ${par_cytaimage:+--cytaimage="$par_cytaimage"} \
  ${par_image:+--image="$par_image"} \
  ${par_slide:+--slide="$par_slide"} \
  ${par_area:+--area="$par_area"} \
  ${par_unknown_slide:+--unknown-slide="$par_unknown_slide"} \
  ${par_slidefile:+--slidefile="$par_slidefile"} \
  ${par_override_id:+--override-id} \
  ${par_darkimage:+--darkimage="$par_darkimage"} \
  ${par_colorizedimage:+--colorizedimage="$par_colorizedimage"} \
  ${par_dapi_index:+--dapi-index="$par_dapi_index"} \
  ${par_image_scale:+--image-scale="$par_image_scale"} \
  ${par_reorient_images:+--reorient-images="$par_reorient_images"} \
  ${par_create_bam:+--create-bam="$par_create_bam"} \
  ${par_nosecondary:+--nosecondary} \
  ${par_r1_length:+--r1-length="$par_r1_length"} \
  ${par_r2_length:+--r2-length="$par_r2_length"} \
  ${par_filter_probes:+--filter-probes="$par_filter_probes"} \
  ${par_custom_bin_size:+--custom-bin-size="$par_custom_bin_size"} \
  ${par_project:+--project="$par_project"} \
  ${par_sample:+--sample="$par_sample"} \
  ${par_lanes:+--lanes="$par_lanes"} \
  ${meta_cpus:+--localcores="$meta_cpus"} \
  ${localmem:+--localmem="$localmem"}

# Flatten the result: move everything from `outs/` into the output folder.
mv -f "$par_output"/outs/* "$par_output"/
rm -rf "$par_output"/outs

View File

@@ -0,0 +1,47 @@
#!/bin/bash

set -eo pipefail

## VIASH START
meta_executable="target/native/spaceranger/spaceranger_count/spaceranger_count"
meta_resources_dir="resources_test"
## VIASH END

test_data="$meta_resources_dir/visium"

# Folder the component writes its results to
OUT_DIR="test_spaceranger"

# Assert that a file exists and is not empty; abort the whole test otherwise.
#   $1: path of the file to inspect
#   $2: human readable label used in the progress message
check_file() {
  local path=$1
  local label=$2

  echo -n "Checking $label... "

  if [ -f "$path" ] && [ -s "$path" ]; then
    echo "OK"
    return 0
  fi

  if [ -f "$path" ]; then
    echo "FAIL (file is empty)"
  else
    echo "FAIL (file not found)"
  fi
  exit 1
}

echo "> Default test run"
"$meta_executable" \
  --output "$OUT_DIR" \
  --gex_reference "$meta_resources_dir/GRCh38" \
  --input "$test_data/subsampled" \
  --probe_set "$test_data/Visium_FFPE_Human_Ovarian_Cancer_probe_set.csv" \
  --image "$test_data/subsampled/Visium_FFPE_Human_Ovarian_Cancer_image.jpg" \
  --unknown_slide visium-1 \
  --create_bam false

echo "> Checking outputs..."

# Files every successful run is expected to produce
check_file "$OUT_DIR/web_summary.html" "web summary"
check_file "$OUT_DIR/metrics_summary.csv" "metrics summary"

echo "> All tests passed successfully!"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a compressed copy of an h5mu file.

    Every group and dataset of the input is recreated in the output together
    with its attributes. Datasets that are not yet compressed and are not
    scalar are written with the requested compression; all other datasets are
    copied as they are. Finally the MuData header stored in the first 512
    bytes of the input file is reproduced in the output file.

    Args:
        input_path: location of the h5mu file to read.
        output_path: location of the h5mu file to create (overwritten).
        compression: compression filter passed to h5py for the datasets.

    Raises:
        NotImplementedError: when an object is visited that is neither a
            group nor a dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the block at the start of the file that holds the MuData header.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        # Attributes are not carried over by create_group/create_dataset.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        object: Union[Group, Dataset],
    ):
        if isinstance(object, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(object, new_group)
        elif isinstance(object, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects dont have a shape defined
            if not object.compression and object.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=object, compression=compression
                )
                copy_attributes(object, new_dataset)
            else:
                output_h5.copy(object, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(object)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole block: a header longer than 100 bytes used to be cut.
        starting_metadata = input_bytes.read(userblock_size)

    # The metadata is padded with extra null bytes up until 512 bytes.
    # `find` returns -1 when there is no padding at all; slicing with -1 would
    # silently drop the last byte, so only truncate when a null byte exists.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]

    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an h5mu file and write one modality into the copy.

    Without compression the copy is made directly at ``output_file``. With
    compression the copy is first made next to ``output_file`` under the name
    ``<stem>_uncompressed.h5mu``, then compressed into ``output_file``, and
    the intermediate file is deleted.

    Args:
        output_file: path of the h5mu file to produce.
        h5mu: path of the existing h5mu file used as starting point.
        modality_name: name of the modality to write.
        modality_data: data written for that modality.
        output_compression: compression handed to ``compress_h5mu``; a falsy
            value disables compression.
    """
    destination = Path(output_file)
    template = Path(h5mu)

    if not output_compression:
        # No compression requested: work on the final file right away.
        shutil.copyfile(template, destination)
        write_h5ad(filename=destination, mod=modality_name, data=modality_data)
        return

    staging = destination.with_name(destination.stem + "_uncompressed.h5mu")
    shutil.copyfile(template, staging)
    write_h5ad(filename=staging, mod=modality_name, data=modality_data)
    compress_h5mu(staging, destination, compression=output_compression)
    # The uncompressed intermediate is no longer needed.
    staging.unlink()

12
src/utils/setup_logger.py Normal file
View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to print INFO messages to stdout.

    The root logger level is set to INFO and a stream handler writing to the
    current ``sys.stdout`` is attached, formatted as
    ``<time> <level, padded to 8> <message>``.

    Calling this function repeatedly does not stack handlers: a handler that
    was attached by an earlier call and still targets the current stdout is
    reused, so messages are not printed several times.

    Returns:
        The root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only handlers created by this function are considered, so handlers that
    # were configured elsewhere never suppress the console output.
    already_attached = any(
        getattr(handler, "_installed_by_setup_logger", False)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        console_handler._installed_by_setup_logger = True
        logger.addHandler(console_handler)

    return logger

View File

@@ -0,0 +1,318 @@
name: "spatial_process_samples"
namespace: "workflows/multiomics"
scope: "public"
description: "A pipeline to pre-process multiple spatial omics samples."
authors:
- __merge__: /src/authors/dries_schaumont.yaml
roles: [ author, maintainer ]
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ contributor ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
argument_groups:
- name: Inputs
arguments:
- name: "--id"
required: true
type: string
description: ID of the sample.
example: foo
- name: "--input"
alternatives: [-i]
description: Path to the sample.
required: true
example: input.h5mu
type: file
- name: "--rna_layer"
type: string
description: "Input layer for the gene expression modality. If not specified, .X is used."
required: false
- name: "--prot_layer"
type: string
description: "Input layer for the antibody capture modality. If not specified, .X is used."
required: false
- name: "Outputs"
arguments:
- name: "--output"
type: file
required: true
direction: output
description: Destination path to the output.
example: output.h5mu
- name: "Sample ID options"
description: |
Options for adding the id to .obs on the MuData object. Having a sample
id present is a requirement of several components for this pipeline.
arguments:
- name: "--add_id_to_obs"
description: "Add the value passed with --id to .obs."
type: boolean
default: true
- name: --add_id_obs_output
description: |
.Obs column to add the sample IDs to. Required and only used when
--add_id_to_obs is set to 'true'
type: string
default: "sample_id"
- name: "--add_id_make_observation_keys_unique"
type: boolean
description: |
Join the id to the .obs index (.obs_names).
Only used when --add_id_to_obs is set to 'true'.
default: true
- name: "RNA filtering options"
arguments:
- name: "--rna_min_counts"
example: 200
min: 1
type: integer
description: Minimum number of counts captured per cell.
- name: "--rna_max_counts"
example: 5000000
min: 1
type: integer
description: Maximum number of counts captured per cell.
- name: "--rna_min_genes_per_cell"
type: integer
min: 1
example: 200
description: Minimum of non-zero values per cell.
- name: "--rna_max_genes_per_cell"
example: 1500000
min: 1
type: integer
description: Maximum of non-zero values per cell.
- name: "--rna_min_cells_per_gene"
example: 3
min: 1
type: integer
description: Minimum of non-zero values per gene.
- name: "--rna_min_fraction_mito"
example: 0
min: 0
max: 1
type: double
description: Minimum fraction of UMIs that are mitochondrial.
- name: "--rna_max_fraction_mito"
type: double
min: 0
max: 1
example: 0.2
description: Maximum fraction of UMIs that are mitochondrial.
- name: "--rna_min_fraction_ribo"
example: 0
min: 0
max: 1
type: double
description: Minimum fraction of UMIs that are ribosomal.
- name: "--rna_max_fraction_ribo"
type: double
min: 0
max: 1
example: 0.2
description: Maximum fraction of UMIs that are ribosomal.
- name: "Protein filtering options"
arguments:
- name: "--prot_min_counts"
description: Minimum number of counts per cell.
type: integer
min: 1
example: 3
- name: "--prot_max_counts"
description: Maximum number of counts per cell.
type: integer
min: 1
example: 5000000
- name: "--prot_min_proteins_per_cell"
type: integer
min: 1
example: 200
description: Minimum of non-zero values per cell.
- name: "--prot_max_proteins_per_cell"
description: Maximum of non-zero values per cell.
type: integer
min: 1
example: 100000000
- name: "--prot_min_cells_per_protein"
example: 3
min: 1
type: integer
description: Minimum of non-zero values per protein.
- name: "Highly variable features detection"
arguments:
- name: "--highly_variable_features_var_output"
alternatives: ["--filter_with_hvg_var_output"]
required: false
type: string
default: "filter_with_hvg"
description: In which .var slot to store a boolean array corresponding to the highly variable genes.
- name: "--highly_variable_features_obs_batch_key"
alternatives: ["--filter_with_hvg_obs_batch_key"]
type: string
default: "sample_id"
required: false
description: |
If specified, highly-variable genes are selected within each batch separately and merged. This simple
process avoids the selection of batch-specific genes and acts as a lightweight batch correction method.
- name: "Mitochondrial & Ribosomal Gene Detection"
arguments:
- name: "--var_gene_names"
required: false
example: "gene_symbol"
type: string
description: |
.var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).
Gene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be
identified as mitochondrial or ribosomal genes, respectively.
- name: "--var_name_mitochondrial_genes"
type: string
required: false
description: |
In which .var slot to store a boolean array corresponding the mitochondrial genes.
- name: "--obs_name_mitochondrial_fraction"
type: string
required: false
description: |
When specified, write the fraction of counts originating from mitochondrial genes
(based on --mitochondrial_gene_regex) to an .obs column with the specified name.
Requires --var_name_mitochondrial_genes.
- name: --mitochondrial_gene_regex
type: string
description: |
Regex string that identifies mitochondrial genes from --var_gene_names.
By default will detect human and mouse mitochondrial genes from a gene symbol.
required: false
default: "^[mM][tT]-"
- name: "--var_name_ribosomal_genes"
type: string
required: false
description: |
In which .var slot to store a boolean array corresponding the ribosomal genes.
- name: "--obs_name_ribosomal_fraction"
type: string
required: false
description: |
When specified, write the fraction of counts originating from ribosomal genes
(based on --ribosomal_gene_regex) to an .obs column with the specified name.
Requires --var_name_ribosomal_genes.
- name: --ribosomal_gene_regex
type: string
description: |
Regex string that identifies ribosomal genes from --var_gene_names.
By default will detect human and mouse ribosomal genes from a gene symbol.
required: false
default: "^[Mm]?[Rr][Pp][LlSs]"
- name: "QC metrics calculation options"
arguments:
- name: "--var_qc_metrics"
description: |
Keys to select a boolean (containing only True or False) column from .var.
For each cell, calculate the proportion of total values for genes which are labeled 'True',
compared to the total sum of the values for all genes. Defaults to the combined values specified for
--var_name_mitochondrial_genes and --highly_variable_features_var_output.
type: string
multiple: True
multiple_sep: ','
required: false
example: "ercc,highly_variable"
- name: "--top_n_vars"
type: integer
description: |
Number of top vars to be used to calculate cumulative proportions.
If not specified, proportions are not calculated. `--top_n_vars 20,50` finds
cumulative proportion to the 20th and 50th most expressed vars.
multiple: true
multiple_sep: ','
required: false
default: [50, 100, 200, 500]
- name: "PCA options"
arguments:
- name: "--pca_overwrite"
type: boolean_true
description: "Allow overwriting slots for PCA output."
- name: "CLR options"
arguments:
- name: "--clr_axis"
type: integer
description: "Axis to perform the CLR transformation on."
default: 0
required: false
- name: "RNA Scaling options"
description: |
Options for enabling scaling of the log-normalized data to unit variance and zero mean.
The scaled data will be output to a different layer and a representation with reduced dimensions
will be created and stored in addition to the non-scaled data.
arguments:
- name: "--rna_enable_scaling"
description: "Enable scaling for the RNA modality."
type: boolean_true
- name: "--rna_scaling_output_layer"
type: string
default: "scaled"
description: "Output layer where the scaled log-normalized data will be stored."
- name: "--rna_scaling_pca_obsm_output"
type: string
description: |
Name of the .obsm key where the PCA representation of the log-normalized
and scaled data is stored.
default: "scaled_pca"
- name: "--rna_scaling_pca_loadings_varm_output"
type: string
description: |
Name of the .varm key where the PCA loadings of the log-normalized and scaled
data is stored.
default: "scaled_pca_loadings"
- name: "--rna_scaling_pca_variance_uns_output"
type: string
description: |
Name of the .uns key where the variance and variance ratio will be stored as a map.
The map will contain two keys: variance and variance_ratio respectively.
default: "scaled_pca_variance"
- name: "--rna_scaling_umap_obsm_output"
type: string
description:
Name of the .obsm key where the UMAP representation of the log-normalized
and scaled data is stored.
default: "scaled_umap"
- name: "--rna_scaling_max_value"
description: "Clip (truncate) data to this value after scaling. If not specified, do not clip."
required: false
type: double
- name: "--rna_scaling_zero_center"
type: boolean_false
description: "If set, omit zero-centering variables, which allows to handle sparse input efficiently."
dependencies:
- name: workflows/multiomics/process_samples
alias: spatial_sample_processing
repository: openpipeline_scrublet
repositories:
- name: openpipeline_scrublet
repo: openpipelines-bio/openpipeline
type: github
tag: disable-scrublet_build
resources:
- type: nextflow_script
path: main.nf
entrypoint: run_wf
test_resources:
- type: nextflow_script
path: test.nf
entrypoint: test_wf
- path: /resources_test/xenium/xenium_tiny.h5mu
runners:
- type: nextflow

View File

@@ -0,0 +1,17 @@
#!/bin/bash

set -eo pipefail

# Integration test for the spatial_process_samples workflow.
# All paths handed to nextflow are relative to the repository root, so the
# root is looked up through git and used as working directory.
REPO_ROOT="$(git rev-parse --show-toplevel)"
cd "$REPO_ROOT"

nextflow run . \
  -main-script src/workflows/multiomics/spatial_process_samples/test.nf \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config

View File

@@ -0,0 +1,77 @@
// Wrapper workflow: forwards every incoming event to the
// `spatial_sample_processing` dependency, always passing
// `skip_scrublet_filtering: "true"`, and emits a state that only holds
// `_meta` and `output`.
workflow run_wf {
take:
input_ch
main:
output_ch = input_ch
// Re-key the event with `state.id`. The id the event arrived with is kept
// under `_meta.join_id` and the requested output under `workflow_output`.
| map { id, state ->
def new_state = [
state.id,
state + ["_meta": ["join_id": id], "workflow_output": state.output]
]
new_state
}
| spatial_sample_processing.run(
// Arguments taken from the state; `id` is the event id set in the step above.
fromState: { id, state -> [
"id": id,
"input": state.input,
"rna_layer": state.rna_layer,
"prot_layer": state.prot_layer,
"add_id_to_obs": state.add_id_to_obs,
"add_id_obs_output": state.add_id_obs_output,
"add_id_make_observation_keys_unique": state.add_id_make_observation_keys_unique,
"rna_min_counts": state.rna_min_counts,
"rna_max_counts": state.rna_max_counts,
"rna_min_genes_per_cell": state.rna_min_genes_per_cell,
"rna_max_genes_per_cell": state.rna_max_genes_per_cell,
"rna_min_cells_per_gene": state.rna_min_cells_per_gene,
"rna_min_fraction_mito": state.rna_min_fraction_mito,
"rna_max_fraction_mito": state.rna_max_fraction_mito,
"rna_min_fraction_ribo": state.rna_min_fraction_ribo,
"rna_max_fraction_ribo": state.rna_max_fraction_ribo,
"prot_min_counts": state.prot_min_counts,
"prot_max_counts": state.prot_max_counts,
"prot_min_proteins_per_cell": state.prot_min_proteins_per_cell,
"prot_max_proteins_per_cell": state.prot_max_proteins_per_cell,
"prot_min_cells_per_protein": state.prot_min_cells_per_protein,
"highly_variable_features_var_output": state.highly_variable_features_var_output,
"highly_variable_features_obs_batch_key": state.highly_variable_features_obs_batch_key,
"var_gene_names": state.var_gene_names,
"var_name_mitochondrial_genes": state.var_name_mitochondrial_genes,
"obs_name_mitochondrial_fraction": state.obs_name_mitochondrial_fraction,
"mitochondrial_gene_regex": state.mitochondrial_gene_regex,
"var_name_ribosomal_genes": state.var_name_ribosomal_genes,
"obs_name_ribosomal_fraction": state.obs_name_ribosomal_fraction,
"ribosomal_gene_regex": state.ribosomal_gene_regex,
"var_qc_metrics": state.var_qc_metrics,
"top_n_vars": state.top_n_vars,
"pca_overwrite": state.pca_overwrite,
"clr_axis": state.clr_axis,
"rna_enable_scaling": state.rna_enable_scaling,
"rna_scaling_output_layer": state.rna_scaling_output_layer,
"rna_scaling_pca_obsm_output": state.rna_scaling_pca_obsm_output,
"rna_scaling_pca_loadings_varm_output": state.rna_scaling_pca_loadings_varm_output,
"rna_scaling_pca_variance_uns_output": state.rna_scaling_pca_variance_uns_output,
"rna_scaling_umap_obsm_output": state.rna_scaling_umap_obsm_output,
"rna_scaling_max_value": state.rna_scaling_max_value,
"rna_scaling_zero_center": state.rna_scaling_zero_center,
"output": state.workflow_output
]},
// Fixed argument: it is not read from the state, so callers of this
// workflow cannot change it.
args: [
"skip_scrublet_filtering": "true",
],
// Store the `output` of the dependency under the `output` key of the state.
toState: [
"output": "output"
]
)
// Keep only `_meta` and `output` in the emitted state.
| setState(
[
"_meta": "_meta",
"output": "output"
]
)
emit:
output_ch
}

View File

@@ -0,0 +1,10 @@
// Nextflow configuration for this workflow directory.
manifest {
// Nextflow version constraint for running this workflow.
// NOTE(review): the leading '!' presumably makes the constraint strict - confirm.
nextflowVersion = '!>=20.12.1-edge'
}
params {
// Normalized absolute path four directory levels above `projectDir`;
// used below to locate the shared configuration.
rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
}
// include common settings
includeConfig("${params.rootDir}/src/workflows/utils/labels.config")

View File

@@ -0,0 +1,33 @@
nextflow.enable.dsl=2
targetDir = params.rootDir + "/target/nextflow"
include { spatial_process_samples } from targetDir + "/workflows/multiomics/spatial_process_samples/main.nf"
params.resources_test = params.rootDir + "/resources_test"
// Runs spatial_process_samples on the tiny xenium dataset and checks the
// shape of every emitted event as well as the total number of events.
workflow test_wf {
  resources_test = file(params.resources_test)

  // The single sample that is sent through the workflow
  xenium_sample = [
    id: "xenium",
    input: resources_test.resolve("xenium/xenium_tiny.h5mu"),
    publish_dir: "foo/",
    output: "test.h5mu",
  ]

  output_ch = Channel.fromList([xenium_sample])
    | map { sample -> [sample.id, sample] }
    | spatial_process_samples
    | view { event ->
      // each event must be an [id, state] pair whose output is the requested file
      assert event.size() == 2 : "outputs should contain two elements; [id, file]"
      assert event[1].output.toString().endsWith("test.h5mu") : "Output file should be a h5mu file. Found: ${event[1].output}"
      "Output: $event"
    }
    | toSortedList()
    | map { events ->
      // exactly one event, carrying the id 'merged', is expected
      assert events.size() == 1 : "output channel should contain one event"
      assert events[0][0] == "merged" : "Output ID should be 'merged'"
    }
}

View File

@@ -0,0 +1,174 @@
name: "spatial_qc"
namespace: "workflows/qc"
scope: "public"
description: "A pipeline to add basic qc statistics to a MuData containing spatial data."
authors:
- __merge__: /src/authors/dries_schaumont.yaml
roles: [ author, maintainer ]
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ contributor ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
info:
test_dependencies:
- name: qc_test
namespace: test_workflows/qc
argument_groups:
- name: Inputs
arguments:
- name: "--id"
required: true
type: string
description: ID of the sample.
example: foo
- name: "--input"
alternatives: [-i]
description: Path to the sample.
required: true
example: input.h5mu
type: file
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--layer"
description: "Use specified layer for calculation of qc metrics. If not specified, adata.X is used."
type: string
example: "raw_counts"
required: false
- name: "Mitochondrial & Ribosomal Gene Detection"
arguments:
- name: "--var_gene_names"
required: false
example: "gene_symbol"
type: string
description: |
.var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).
Gene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be
identified as mitochondrial or ribosomal genes, respectively.
- name: "--var_name_mitochondrial_genes"
type: string
required: false
description: |
In which .var slot to store a boolean array corresponding the mitochondrial genes.
- name: "--obs_name_mitochondrial_fraction"
type: string
required: false
description: |
.Obs slot to store the fraction of reads found to be mitochondrial. Defaults to 'fraction_' suffixed by the value of --var_name_mitochondrial_genes
- name: --mitochondrial_gene_regex
type: string
description: |
Regex string that identifies mitochondrial genes from --var_gene_names.
By default will detect human and mouse mitochondrial genes from a gene symbol.
required: false
default: "^[mM][tT]-"
- name: "--var_name_ribosomal_genes"
type: string
required: false
description: |
In which .var slot to store a boolean array corresponding the ribosomal genes.
- name: "--obs_name_ribosomal_fraction"
type: string
required: false
description: |
When specified, write the fraction of counts originating from ribosomal genes
(based on --ribosomal_gene_regex) to an .obs column with the specified name.
Requires --var_name_ribosomal_genes.
- name: --ribosomal_gene_regex
type: string
description: |
Regex string that identifies ribosomal genes from --var_gene_names.
By default will detect human and mouse ribosomal genes from a gene symbol.
required: false
default: "^[Mm]?[Rr][Pp][LlSs]"
- name: "QC metrics calculation options"
arguments:
- name: "--var_qc_metrics"
description: |
Keys to select a boolean (containing only True or False) column from .var.
For each cell, calculate the proportion of total values for genes which are labeled 'True',
compared to the total sum of the values for all genes. Defaults to the value from
--var_name_mitochondrial_genes.
type: string
multiple: True
multiple_sep: ','
required: false
example: "ercc,highly_variable"
- name: "--top_n_vars"
type: integer
description: |
Number of top vars to be used to calculate cumulative proportions.
If not specified, proportions are not calculated. `--top_n_vars 20,50` finds
cumulative proportion to the 20th and 50th most expressed vars.
multiple: true
multiple_sep: ','
required: false
default: [50, 100, 200, 500]
- name: "--output_obs_num_nonzero_vars"
description: |
Name of column in .obs describing, for each observation, the number of stored values
(including explicit zeroes). In other words, the name of the column that counts
for each row the number of columns that contain data.
type: string
required: false
default: "num_nonzero_vars"
- name: "--output_obs_total_counts_vars"
description: |
Name of the column for .obs describing, for each observation (row),
the sum of the stored values in the columns.
type: string
required: false
default: total_counts
- name: "--output_var_num_nonzero_obs"
description: |
Name of column describing, for each feature, the number of stored values
(including explicit zeroes). In other words, the name of the column that counts
for each column the number of rows that contain data.
type: string
required: false
default: "num_nonzero_obs"
- name: "--output_var_total_counts_obs"
description: |
Name of the column in .var describing, for each feature (column),
the sum of the stored values in the rows.
type: string
required: false
default: total_counts
- name: "--output_var_obs_mean"
type: string
description: |
Name of the column in .obs providing the mean of the values in each row.
default: "obs_mean"
required: false
- name: "--output_var_pct_dropout"
type: string
default: "pct_dropout"
description: |
Name of the column in .var providing for each feature the percentage of
observations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`
but percentage based.
- name: "Outputs"
arguments:
- name: "--output"
type: file
required: true
direction: output
description: Destination path to the output.
example: output.h5mu
dependencies:
- name: workflows/qc/qc
alias: spatial_qc_workflow
repository: openpipeline
resources:
- type: nextflow_script
path: main.nf
entrypoint: run_wf
test_resources:
- type: nextflow_script
path: test.nf
entrypoint: test_wf
- path: /resources_test/xenium/xenium_tiny.h5mu
runners:
- type: nextflow

View File

@@ -0,0 +1,15 @@
#!/bin/bash

# Stop at the first failing command (also inside pipelines). Without this a
# failing `git rev-parse` left REPO_ROOT empty and nextflow was still started
# from whatever the current directory happened to be.
set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

nextflow \
  run . \
  -main-script src/workflows/qc/spatial_qc/test.nf \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config

View File

@@ -0,0 +1,38 @@
// Wrapper workflow: forwards every incoming event to the
// `spatial_qc_workflow` dependency and emits a state that only holds `output`.
workflow run_wf {
take:
input_ch
main:
output_ch = input_ch
| spatial_qc_workflow.run(
// Arguments are taken one-to-one from the state; `id` is the event id.
fromState: { id, state -> [
"id": id,
"input": state.input,
"modality": state.modality,
"layer": state.layer,
"var_gene_names": state.var_gene_names,
"var_name_mitochondrial_genes": state.var_name_mitochondrial_genes,
"obs_name_mitochondrial_fraction": state.obs_name_mitochondrial_fraction,
"mitochondrial_gene_regex": state.mitochondrial_gene_regex,
"var_name_ribosomal_genes": state.var_name_ribosomal_genes,
"obs_name_ribosomal_fraction": state.obs_name_ribosomal_fraction,
"ribosomal_gene_regex": state.ribosomal_gene_regex,
"var_qc_metrics": state.var_qc_metrics,
"top_n_vars": state.top_n_vars,
"output_obs_num_nonzero_vars": state.output_obs_num_nonzero_vars,
"output_obs_total_counts_vars": state.output_obs_total_counts_vars,
"output_var_num_nonzero_obs": state.output_var_num_nonzero_obs,
"output_var_total_counts_obs": state.output_var_total_counts_obs,
"output_var_obs_mean": state.output_var_obs_mean,
"output_var_pct_dropout": state.output_var_pct_dropout
]},
// Store the `output` of the dependency under the `output` key of the state.
toState: [
"output": "output"
]
)
// Keep only `output` in the emitted state.
| setState(["output"])
emit:
output_ch
}

View File

@@ -0,0 +1,10 @@
// Nextflow configuration for this workflow directory.
manifest {
// Minimum supported Nextflow version.
nextflowVersion = '!>=20.12.1-edge'
}
params {
// Absolute, normalized path four directory levels above the project directory.
// NOTE(review): presumably this is the repository root — confirm against the directory layout.
rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
}
// include common settings
// (the shared label definitions, located relative to params.rootDir)
includeConfig("${params.rootDir}/src/workflows/utils/labels.config")

View File

@@ -0,0 +1,40 @@
nextflow.enable.dsl=2
include { spatial_qc } from params.rootDir + "/target/nextflow/workflows/qc/spatial_qc/main.nf"
params.resources_test = params.rootDir + "/resources_test"
// Integration test: runs spatial_qc on the tiny xenium test dataset and checks
// that each emitted event is an [id, state] pair whose state holds an
// existing '.h5mu' output file.
workflow test_wf {
  resources_test = file(params.resources_test)

  output_ch =
    Channel.fromList([
      [
        id: "xenium_test",
        input: resources_test.resolve("xenium/xenium_tiny.h5mu"),
        var_name_mitochondrial_genes: "mitochondrial",
        var_name_ribosomal_genes: "ribosomal",
      ]
    ])
    | map { state -> [state.id, state] }
    | spatial_qc.run(
      // keep the workflow output and remember the original input path
      toState: { id, output, state -> output + [og_input: state.input] }
    )
    | view { output ->
      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"

      // check id
      def id = output[0]
      assert id.endsWith("_test")

      // check output
      def state = output[1]
      assert state instanceof Map : "State should be a map. Found: ${state}"
      assert state.containsKey("output") : "Output should contain key 'output'."
      assert state.output.isFile() : "'output' should be a file."
      assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"

      // `view` prints whatever the closure returns; an `assert` as the last
      // statement yields null, so return a readable message instead.
      "Output: $output"
    }
}

View File

@@ -0,0 +1,36 @@
// Shared Nextflow profiles: temp-dir mounting, a switch to disable publishing,
// and docker as the container engine.
profiles {
// detect tempdir
// The first of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR that is set (non-empty) wins;
// '/tmp' is the fallback. The result is stored as an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Profile: pass the detected temp dir to the `temp` setting of the container engines.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Profile: disable publishDir for every process (the '.*' selector matches all process names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Profile: enable docker and explicitly disable all other container engines.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
}

View File

@@ -0,0 +1,68 @@
// Resource defaults and label-based resource classes for all processes.
process {
// Default resources for components that hardly do any processing
// (memory scales linearly with the attempt number)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 are retried, everything else terminates the run)
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// No memory cap by default; get_memory() returns requests unchanged while this is null.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// NOTE(review): lowmem, midmem and highmem all use the same 50.GB base — confirm this is intended.
withLabel: lowmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// Nextflow apparently can't handle empty directives, i.e.
// withLabel: lowdisk {}
// so for that reason we have to add a dummy directive
withLabel: lowdisk {
dummyDirective = "dummyValue"
}
withLabel: middisk {
dummyDirective = "dummyValue"
}
withLabel: highdisk {
dummyDirective = "dummyValue"
}
withLabel: veryhighdisk {
dummyDirective = "dummyValue"
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested amount of memory at `process.maxMemory`.
//
// to_compare: the memory amount a label would like to request.
// Returns:
//   - `to_compare` unchanged when no `process.maxMemory` is configured,
//   - `process.maxMemory` on the attempt whose number equals `process.maxRetries`,
//   - `process.maxMemory` when the request exceeds it,
//   - `to_compare` otherwise.
// Any error while evaluating the limits prints a hint and exits with status 1.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; presumably it is
    // resolved from the context of the calling directive closure — confirm.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request is larger than the cap: return the cap itself.
      // (This branch previously referenced the undefined name `max_memory`.)
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,105 @@
// Fixed, small resource classes: every memory label maps to 13.Gb and every
// low/mid/high cpu label to 4 cpus.
process {
withLabel: lowmem { memory = 13.Gb }
withLabel: lowcpu { cpus = 4 }
withLabel: midmem { memory = 13.Gb }
withLabel: midcpu { cpus = 4 }
withLabel: highmem { memory = 13.Gb }
withLabel: highcpu { cpus = 4 }
withLabel: veryhighmem { memory = 13.Gb }
// Disk labels: reuse a globally configured process.disk when present, otherwise null.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
}
// Environment variable exported to the tasks: numba's cache directory is set to /tmp.
env.NUMBA_CACHE_DIR = '/tmp'
// Write an execution trace and allow an existing trace file to be overwritten.
trace {
enabled = true
overwrite = true
}
// Allow an existing DAG file to be overwritten.
dag {
overwrite = true
}
// Run at most one instance of each process at a time.
process.maxForks = 1
// Profiles: temp-dir mounting, a switch to disable publishing, docker as the
// container engine, and resource limits for local runs.
profiles {
// detect tempdir
// The first of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR that is set (non-empty) wins;
// '/tmp' is the fallback. The result is stored as an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Profile: pass the detected temp dir to the `temp` setting of the container engines.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Profile: disable publishDir for every process (the '.*' selector matches all process names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Profile: enable docker (with fixOwnership) and explicitly disable all other container engines.
docker {
docker.fixOwnership = true
docker.enabled = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
local {
// This config is for local processing.
process {
// Upper bound that get_memory() applies to the label requests below.
maxMemory = 25.GB
withLabel: verylowcpu { cpus = 2 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 6 }
withLabel: highcpu { cpus = 12 }
// Memory requests grow with the attempt number and are passed through get_memory().
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
}
}
}
// Cap a requested amount of memory at `process.maxMemory`.
//
// to_compare: the memory amount a label would like to request.
// Returns:
//   - `to_compare` unchanged when no `process.maxMemory` is configured,
//   - `process.maxMemory` on the attempt whose number equals `process.maxRetries`,
//   - `process.maxMemory` when the request exceeds it,
//   - `to_compare` otherwise.
// Any error while evaluating the limits prints a hint and exits with status 1.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; presumably it is
    // resolved from the context of the calling directive closure — confirm.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request is larger than the cap: return the cap itself.
      // (This branch previously referenced the undefined name `max_memory`.)
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

0
target/.build.yaml Normal file
View File

View File

@@ -0,0 +1,318 @@
name: "grep_annotation_column"
namespace: "metadata"
version: "2.1.2"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
description: "Arguments related to the input dataset."
arguments:
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Path to the input .h5mu."
info: null
example:
- "sample_path"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_column"
description: "Column to query. If not specified, use .var_names or .obs_names,\
\ depending on the value of --matrix"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--input_layer"
description: "Input data to use when calculating fraction of observations that\
\ match with the query. \nOnly used when --output_fraction_column is provided.\
\ If not specified, .X is used.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to get the annotation matrix from.\n"
info: null
example:
- "rna"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--matrix"
description: "Matrix to fetch the column from that will be searched."
info: null
example:
- "var"
required: false
choices:
- "var"
- "obs"
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "Arguments related to how the output will be written."
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "The compression format to be used on the output h5mu object."
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_match_column"
description: "Name of the column to write the result to."
info: null
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_fraction_column"
description: "For the opposite axis, name of the column to write the fraction\
\ of \nobservations that matches to the pattern.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Query options"
description: "Options related to the query"
arguments:
- type: "string"
name: "--regex_pattern"
description: "Regex to use to match with the input column."
info: null
example:
- "^[mM][tT]-"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Perform a regex lookup on a column from the annotation matrices .obs\
\ or .var.\nThe annotation matrix can originate from either a modality, or all modalities\
\ (global .var or .obs).\n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "lowmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11-slim"
target_tag: "2.1.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/metadata/grep_annotation_column/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/metadata/grep_annotation_column"
executable: "target/nextflow/metadata/grep_annotation_column/main.nf"
viash_version: "0.9.4"
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.1-2-ga0c95224865"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"2.1.2\""
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Write a compressed copy of an HDF5-based .h5mu file.

    Every group and dataset of ``input_path`` is re-created in ``output_path``
    together with its attributes. Non-scalar datasets that are not compressed
    yet are written with ``compression``; everything else is copied as is.
    Afterwards the textual header stored at the start of the input file is
    copied into the 512-byte userblock of the output file.

    Args:
        input_path: Path of the existing file to read.
        output_path: Path of the file to create (opened in "w" mode).
        compression: Compression filter for the re-created datasets.

    Raises:
        NotImplementedError: If the input contains an object that is neither
            a group nor a dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the userblock reserved at the start of the output file.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        # Mirror all HDF5 attributes from one object onto another.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        # Callback for visititems(): re-create one input object in the output.
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects dont have a shape defined
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        # Read the whole block so that a long header is not cut short.
        starting_metadata = input_bytes.read(userblock_size)

    # The metadata is padded with extra null bytes up until 512 bytes
    truncate_location = starting_metadata.find(b"\x00")
    # find() returns -1 when there is no padding at all; slicing with -1 would
    # silently drop the last header byte, so only truncate on a real match.
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]

    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        # Re-pad the userblock with null bytes up to its full size.
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file, write one modality into the copy and optionally compress it.

    Without ``output_compression`` the copy is made directly at ``output_file``.
    With it, the copy is first written to a sibling file named
    ``<stem>_uncompressed.h5mu``, compressed into ``output_file`` and the
    intermediate file is removed again.

    Args:
        output_file: Destination path of the resulting file.
        h5mu: Path of the existing file that is used as the starting point.
        modality_name: Name of the modality to write (passed as ``mod``).
        modality_data: Data of that modality (passed as ``data``).
        output_compression: Compression passed on to ``compress_h5mu``;
            a falsy value disables compression.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)
    output_file_uncompressed = (
        output_file.with_name(output_file.stem + "_uncompressed.h5mu")
        if output_compression
        else output_file
    )
    # Kept outside the try block on purpose: if the copy is refused (for
    # instance because source and destination are the same file) the cleanup
    # below must never run, as it could otherwise delete the input.
    shutil.copyfile(h5mu, output_file_uncompressed)
    try:
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        if output_compression:
            compress_h5mu(
                output_file_uncompressed, output_file, compression=output_compression
            )
    finally:
        # Remove the intermediate file even when writing or compressing failed,
        # so a failed run does not leave a stray "_uncompressed" file behind.
        if output_compression:
            output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
// Nextflow configuration of the metadata/grep_annotation_column component.
manifest {
name = 'metadata/grep_annotation_column'
mainScript = 'main.nf'
// Minimum supported Nextflow version.
nextflowVersion = '!>=20.12.1-edge'
version = '2.1.2'
description = 'Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n'
author = 'Dries Schaumont'
}
// Default container for processes.
process.container = 'nextflow/bash:latest'
// detect tempdir
// The first of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR that is set (non-empty) wins;
// '/tmp' is the fallback. The result is stored as an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Profiles: a switch to disable publishing, temp-dir mounting, and one profile
// per container engine. Each engine profile enables its own engine and
// explicitly disables the other four.
profiles {
// Disable publishDir for every process (the '.*' selector matches all process names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Pass the detected temp dir to the `temp` setting of the container engines.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Label-to-resource mapping. Each label name states the amount it requests.
process{
// Memory labels, decimal units (memNgb / memNtb), expressed in bytes.
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
// Memory labels, binary units (memNgib / memNtib), expressed in bytes.
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
// CPU labels.
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Pull in the additional label definitions shipped alongside this file.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
// Resource defaults and label-based resource classes for all processes.
process {
// Default resources for components that hardly do any processing
// (memory scales linearly with the attempt number)
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit statuses 137 through 140 are retried, everything else terminates the run)
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// No memory cap by default; get_memory() returns requests unchanged while this is null.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Requests grow with the attempt number and are passed through get_memory().
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// Each disk label reuses a globally configured process.disk when present, otherwise null.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested amount of memory at `process.maxMemory`.
//
// to_compare: the memory amount a label would like to request.
// Returns:
//   - `to_compare` unchanged when no `process.maxMemory` is configured,
//   - `process.maxMemory` on the attempt whose number equals `process.maxRetries`,
//   - `process.maxMemory` when the request exceeds it,
//   - `to_compare` otherwise.
// Any error while evaluating the limits prints a hint and exits with status 1.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; presumably it is
    // resolved from the context of the calling directive closure — confirm.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The request is larger than the cap: return the cap itself.
      // (This branch previously referenced the undefined name `max_memory`.)
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,21 @@
# Inputs
input: # please fill in - example: "sample_path"
# input_column: "foo"
# input_layer: "foo"
modality: # please fill in - example: "rna"
# matrix: "var"
# Outputs
# output: "$id.$key.output.h5mu"
# output_compression: "gzip"
output_match_column: # please fill in - example: "foo"
# output_fraction_column: "foo"
# Query options
regex_pattern: # please fill in - example: "^[mM][tT]-"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"
# Arguments

View File

@@ -0,0 +1,200 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "grep_annotation_column",
"description": "Perform a regex lookup on a column from the annotation matrices .obs or .var.\nThe annotation matrix can originate from either a modality, or all modalities (global .var or .obs).\n",
"type": "object",
"definitions": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"default": "",
"format": "file-path",
"mimetype": "text/csv",
"pattern": "^\\S+\\.csv$"
}
}
},
"inputs" : {
"title": "Inputs",
"type": "object",
"description": "Arguments related to the input dataset.",
"properties": {
"input": {
"type":
"string",
"description": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu",
"help_text": "Type: `file`, required, example: `sample_path`. Path to the input .h5mu."
}
,
"input_column": {
"type":
"string",
"description": "Type: `string`. Column to query",
"help_text": "Type: `string`. Column to query. If not specified, use .var_names or .obs_names, depending on the value of --matrix"
}
,
"input_layer": {
"type":
"string",
"description": "Type: `string`. Input data to use when calculating fraction of observations that match with the query",
"help_text": "Type: `string`. Input data to use when calculating fraction of observations that match with the query. \nOnly used when --output_fraction_column is provided. If not specified, .X is used.\n"
}
,
"modality": {
"type":
"string",
"description": "Type: `string`, required, example: `rna`. Which modality to get the annotation matrix from",
"help_text": "Type: `string`, required, example: `rna`. Which modality to get the annotation matrix from.\n"
}
,
"matrix": {
"type":
"string",
"description": "Type: `string`, example: `var`, choices: ``var`, `obs``. Matrix to fetch the column from that will be searched",
"help_text": "Type: `string`, example: `var`, choices: ``var`, `obs``. Matrix to fetch the column from that will be searched.",
"enum": ["var", "obs"]
}
}
},
"outputs" : {
"title": "Outputs",
"type": "object",
"description": "Arguments related to how the output will be written.",
"properties": {
"output": {
"type":
"string",
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. ",
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. "
,
"default":"$id.$key.output.h5mu"
}
,
"output_compression": {
"type":
"string",
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object",
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object.",
"enum": ["gzip", "lzf"]
}
,
"output_match_column": {
"type":
"string",
"description": "Type: `string`, required. Name of the column to write the result to",
"help_text": "Type: `string`, required. Name of the column to write the result to."
}
,
"output_fraction_column": {
"type":
"string",
"description": "Type: `string`. For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern",
"help_text": "Type: `string`. For the opposite axis, name of the column to write the fraction of \nobservations that matches to the pattern.\n"
}
}
},
"query options" : {
"title": "Query options",
"type": "object",
"description": "Options related to the query",
"properties": {
"regex_pattern": {
"type":
"string",
"description": "Type: `string`, required, example: `^[mM][tT]-`. Regex to use to match with the input column",
"help_text": "Type: `string`, required, example: `^[mM][tT]-`. Regex to use to match with the input column."
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/inputs"
},
{
"$ref": "#/definitions/outputs"
},
{
"$ref": "#/definitions/query options"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to log at INFO level to stdout and return it.

    The console handler is attached only once: calling this function again
    reuses the handler installed by an earlier call (re-pointing it at the
    current ``sys.stdout``) instead of stacking another one, which would
    print every log record multiple times.

    Returns:
        The root logger.
    """
    import logging
    from sys import stdout

    # Name used to recognise the handler installed by a previous call.
    handler_name = "setup_logger_console"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    existing_handler = next(
        (
            handler
            for handler in logger.handlers
            if handler.get_name() == handler_name
            and isinstance(handler, logging.StreamHandler)
        ),
        None,
    )
    if existing_handler is not None:
        # Follow a stdout that was replaced since the previous call.
        existing_handler.setStream(stdout)
        return logger

    console_handler = logging.StreamHandler(stdout)
    console_handler.set_name(handler_name)
    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    console_handler.setFormatter(logFormatter)
    logger.addHandler(console_handler)
    return logger

View File

@@ -0,0 +1,376 @@
name: "calculate_qc_metrics"
namespace: "qc"
version: "2.1.2"
authors:
- name: "Dries Schaumont"
roles:
- "author"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
arguments:
- type: "file"
name: "--input"
description: "Input h5mu file"
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer"
info: null
example:
- "raw_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Metrics added to .obs"
arguments:
- type: "string"
name: "--var_qc_metrics"
description: "Keys to select a boolean (containing only True or False) column\
\ from .var.\nFor each cell, calculate the proportion of total values for genes\
\ which are labeled 'True', \ncompared to the total sum of the values for all\
\ genes.\n"
info: null
example:
- "ercc,highly_variable,mitochondrial"
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "boolean"
name: "--var_qc_metrics_fill_na_value"
description: "Fill any 'NA' values found in the columns specified with --var_qc_metrics\
\ with 'True' or 'False'.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--top_n_vars"
description: "Number of top vars to be used to calculate cumulative proportions.\n\
If not specified, proportions are not calculated. `--top_n_vars 20;50` finds\n\
cumulative proportion to the 20th and 50th most expressed vars.\n"
info: null
required: false
direction: "input"
multiple: true
multiple_sep: ";"
- type: "string"
name: "--output_obs_num_nonzero_vars"
description: "Name of column in .obs describing, for each observation, the number\
\ of stored values\n(including explicit zeroes). In other words, the name of\
\ the column that counts\nfor each row the number of columns that contain data.\n"
info: null
default:
- "num_nonzero_vars"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_total_counts_vars"
description: "Name of the column for .obs describing, for each observation (row),\n\
the sum of the stored values in the columns.\n"
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Metrics added to .var"
arguments:
- type: "string"
name: "--output_var_num_nonzero_obs"
description: "Name of column describing, for each feature, the number of stored\
\ values\n(including explicit zeroes). In other words, the name of the column\
\ that counts\nfor each column the number of rows that contain data.\n"
info: null
default:
- "num_nonzero_obs"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_total_counts_obs"
description: "Name of the column in .var describing, for each feature (column),\n\
the sum of the stored values in the rows.\n"
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_obs_mean"
description: "Name of the column in .var providing, for each feature, the mean of\
\ its values across all observations.\n"
info: null
default:
- "obs_mean"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_pct_dropout"
description: "Name of the column in .var providing for each feature the percentage\
\ of\nobservations the feature does not appear on (i.e. is missing). Same as\
\ `--output_var_num_nonzero_obs`\nbut percentage based.\n"
info: null
default:
- "pct_dropout"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
description: "Output h5mu file."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_compression"
description: "The compression format to be used on the output h5mu object."
info: null
example:
- "gzip"
required: false
choices:
- "gzip"
- "lzf"
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "python_script"
path: "script.py"
is_executable: true
- type: "file"
path: "setup_logger.py"
- type: "file"
path: "compress_h5mu.py"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are\
\ comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have\
\ slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n\
\ - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n\
\ - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs\
\ metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics}\
\ -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n\
\ - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n\
\ - total_counts -> total_{expr_type}\n \n"
test_resources:
- type: "python_script"
path: "test.py"
is_executable: true
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
info: null
status: "enabled"
scope:
image: "public"
target: "public"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
label:
- "singlecpu"
- "midmem"
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "python:3.11-slim"
target_tag: "2.1.0"
namespace_separator: "/"
setup:
- type: "apt"
packages:
- "procps"
interactive: false
- type: "python"
user: false
packages:
- "anndata~=0.11.1"
- "mudata~=0.3.1"
- "scipy"
script:
- "exec(\"try:\\n import awkward\\nexcept ModuleNotFoundError:\\n exit(0)\\\
nelse: exit(1)\")"
upgrade: true
test_setup:
- type: "apt"
packages:
- "git"
interactive: false
- type: "python"
user: false
packages:
- "viashpy==0.8.0"
github:
- "openpipelines-bio/core#subdirectory=packages/python/openpipeline_testutils"
upgrade: true
- type: "python"
user: false
packages:
- "scanpy"
upgrade: true
entrypoint: []
cmd: null
build_info:
config: "src/qc/calculate_qc_metrics/config.vsh.yaml"
runner: "nextflow"
engine: "docker"
output: "target/nextflow/qc/calculate_qc_metrics"
executable: "target/nextflow/qc/calculate_qc_metrics/main.nf"
viash_version: "0.9.4"
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.1-2-ga0c95224865"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"2.1.2\""
- ".engines[.type == 'docker'].target_tag := '2.1.0'"
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,87 @@
import shutil
from anndata import AnnData
from mudata import write_h5ad
from h5py import File as H5File
from h5py import Group, Dataset
from pathlib import Path
from typing import Union, Literal
from functools import partial
def compress_h5mu(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    compression: Union[Literal["gzip"], Literal["lzf"]],
):
    """Copy an .h5mu file to ``output_path``, compressing its datasets.

    Every group and dataset of the input HDF5 file is visited and recreated in
    the output file together with its attributes. Datasets that are not yet
    compressed and are non-scalar are rewritten with ``compression``; all other
    datasets (already compressed, or scalar) are copied unchanged.

    The output file is created with a 512-byte user block, into which the
    textual header found at the start of the input file is copied and padded
    with NUL bytes.

    Args:
        input_path: Path of the .h5mu file to read.
        output_path: Path of the compressed file to create (overwritten).
        compression: Compression filter to apply, ``"gzip"`` or ``"lzf"``.

    Raises:
        NotImplementedError: If the input contains an HDF5 object that is
            neither a group nor a dataset.
    """
    input_path, output_path = str(input_path), str(output_path)
    # Size of the HDF5 user block that holds the MuData header.
    userblock_size = 512

    def copy_attributes(in_object, out_object):
        # Mirror all HDF5 attributes from one object onto another.
        for key, value in in_object.attrs.items():
            out_object.attrs[key] = value

    def visit_path(
        output_h5: H5File,
        compression: Union[Literal["gzip"], Literal["lzf"]],
        name: str,
        node: Union[Group, Dataset],
    ):
        # Callback for ``visititems``: recreate ``node`` under ``name`` in the output.
        if isinstance(node, Group):
            new_group = output_h5.create_group(name)
            copy_attributes(node, new_group)
        elif isinstance(node, Dataset):
            # Compression only works for non-scalar Dataset objects
            # Scalar objects dont have a shape defined
            if not node.compression and node.shape not in [None, ()]:
                new_dataset = output_h5.create_dataset(
                    name, data=node, compression=compression
                )
                copy_attributes(node, new_dataset)
            else:
                output_h5.copy(node, name)
        else:
            raise NotImplementedError(
                f"Could not copy element {name}, "
                f"type has not been implemented yet: {type(node)}"
            )

    with (
        H5File(input_path, "r") as input_h5,
        H5File(output_path, "w", userblock_size=userblock_size) as output_h5,
    ):
        copy_attributes(input_h5, output_h5)
        input_h5.visititems(partial(visit_path, output_h5, compression))

    with open(input_path, "rb") as input_bytes:
        # Mudata puts metadata like this in the first 512 bytes:
        # MuData (format-version=0.1.0;creator=muon;creator-version=0.2.0)
        # See mudata/_core/io.py, read_h5mu() function
        starting_metadata = input_bytes.read(userblock_size)

    # The metadata is padded with extra null bytes up until 512 bytes.
    # ``find`` returns -1 when there is no padding at all; in that case keep
    # the bytes as they are instead of slicing off the last one.
    truncate_location = starting_metadata.find(b"\x00")
    if truncate_location != -1:
        starting_metadata = starting_metadata[:truncate_location]

    with open(output_path, "br+") as f:
        nbytes = f.write(starting_metadata)
        f.write(b"\0" * (userblock_size - nbytes))
def write_h5ad_to_h5mu_with_compression(
    output_file: Union[str, Path],
    h5mu: Union[str, Path],
    modality_name: str,
    modality_data: AnnData,
    output_compression=None,
):
    """Copy an .h5mu file, replace one modality and optionally compress it.

    The input file ``h5mu`` is copied, ``modality_data`` is written into the
    copy under ``modality_name`` with ``write_h5ad``, and, when
    ``output_compression`` is set, the result is compressed into
    ``output_file`` with ``compress_h5mu``.

    With compression enabled the intermediate result is stored next to
    ``output_file`` as ``<stem>_uncompressed.h5mu``. That temporary file is
    removed afterwards, including when writing or compressing fails.

    Args:
        output_file: Destination path of the resulting .h5mu file.
        h5mu: Path of the existing .h5mu file used as the starting point.
        modality_name: Name of the modality to write.
        modality_data: AnnData object holding the modality contents.
        output_compression: Compression passed on to ``compress_h5mu``;
            a falsy value (the default) writes the output uncompressed.
    """
    output_file = Path(output_file)
    h5mu = Path(h5mu)

    if not output_compression:
        # No compression requested: work directly on the final output file.
        shutil.copyfile(h5mu, output_file)
        write_h5ad(filename=output_file, mod=modality_name, data=modality_data)
        return

    output_file_uncompressed = output_file.with_name(
        output_file.stem + "_uncompressed.h5mu"
    )
    try:
        shutil.copyfile(h5mu, output_file_uncompressed)
        write_h5ad(
            filename=output_file_uncompressed, mod=modality_name, data=modality_data
        )
        compress_h5mu(
            output_file_uncompressed, output_file, compression=output_compression
        )
    finally:
        # Never leave the intermediate file behind, even when a step above raised.
        output_file_uncompressed.unlink(missing_ok=True)

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the qc/calculate_qc_metrics component.
manifest {
name = 'qc/calculate_qc_metrics'
mainScript = 'main.nf'
// NOTE(review): the leading '!' presumably makes this version requirement strict — confirm against the Nextflow manifest documentation.
nextflowVersion = '!>=20.12.1-edge'
version = '2.1.2'
description = 'Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -> name in scanpy):\n - pct_dropout -> pct_dropout_by_{expr_type}\n - num_nonzero_obs -> n_cells_by_{expr_type}\n - obs_mean -> mean_{expr_type}\n - total_counts -> total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -> n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -> pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -> total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -> pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -> total_{expr_type}\n \n'
author = 'Dries Schaumont'
}
// Container image set at the process scope.
process.container = 'nextflow/bash:latest'
// detect tempdir
// The environment variables are tried in the order listed; the first one with
// a non-empty value is used, and '/tmp' is the fallback when none is set.
// The chosen location is converted to an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Selectable configuration profiles.
profiles {
// Disables publishDir for every process (the selector '.*' matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Points the temp setting of docker, podman and charliecloud at the detected tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Each profile below enables exactly one container engine and explicitly
// disables the other four.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Singularity additionally turns on autoMounts.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Fixed resource presets selectable through process labels.
// memXgb / memXtb labels use decimal sizes (powers of 10), memXgib / memXtib
// labels use binary sizes (powers of 2); all values are written out in bytes.
// cpuN labels request N cpus.
process{
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Pull in the additional configuration file nextflow_labels.config.
// NOTE(review): presumably resolved relative to this config file — confirm.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
// Process-wide defaults and label-based resource presets (cpu, memory, disk).
process {
// Default resources for components that hardly do any processing
// The memory request is a closure, so it grows with the attempt number on retries.
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit status 137 through 140); any other status terminates.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound read by get_memory(); while it is unset (null),
// get_memory() returns the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources
// Each request scales with the attempt number and is passed through get_memory().
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
// Every disk label resolves to process.disk when that is set, otherwise to null.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
/**
 * Cap a requested amount of memory at process.maxMemory.
 *
 * - When process.maxMemory is absent or unset, the request is returned as is.
 * - On the attempt whose number equals process.maxRetries, process.maxMemory
 *   itself is returned.
 * - Otherwise the request is returned unless it exceeds process.maxMemory,
 *   in which case process.maxMemory is returned.
 *
 * Any failure while evaluating the settings prints an error message and
 * exits with status 1.
 *
 * @param to_compare the requested amount of memory
 * @return the amount of memory to use
 */
def get_memory(to_compare) {
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // NOTE(review): `task` is not a parameter of this function; presumably it
    // is resolved from the calling directive closure — confirm.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // The original returned the undefined name `max_memory` here, which
      // always threw and ended up in the catch block below.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,27 @@
# Inputs
input: # please fill in - example: "input.h5mu"
modality: "rna"
# layer: "raw_counts"
# Metrics added to .obs
# var_qc_metrics: ["ercc,highly_variable,mitochondrial"]
# var_qc_metrics_fill_na_value: true
# top_n_vars: [123]
output_obs_num_nonzero_vars: "num_nonzero_vars"
output_obs_total_counts_vars: "total_counts"
# Metrics added to .var
output_var_num_nonzero_obs: "num_nonzero_obs"
output_var_total_counts_obs: "total_counts"
output_var_obs_mean: "obs_mean"
output_var_pct_dropout: "pct_dropout"
# Outputs
# output: "$id.$key.output.h5mu"
# output_compression: "gzip"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"
# Arguments

View File

@@ -0,0 +1,259 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "calculate_qc_metrics",
"description": "Add basic quality control metrics to an .h5mu file.\n\nThe metrics are comparable to what scanpy.pp.calculate_qc_metrics output,\nalthough they have slightly different names:\n\nVar metrics (name in this component -\u003e name in scanpy):\n - pct_dropout -\u003e pct_dropout_by_{expr_type}\n - num_nonzero_obs -\u003e n_cells_by_{expr_type}\n - obs_mean -\u003e mean_{expr_type}\n - total_counts -\u003e total_{expr_type}\n\n Obs metrics:\n - num_nonzero_vars -\u003e n_genes_by_{expr_type}\n - pct_{var_qc_metrics} -\u003e pct_{expr_type}_{qc_var}\n - total_counts_{var_qc_metrics} -\u003e total_{expr_type}_{qc_var}\n - pct_of_counts_in_top_{top_n_vars}_vars -\u003e pct_{expr_type}_in_top_{n}_{var_type}\n - total_counts -\u003e total_{expr_type}\n \n",
"type": "object",
"definitions": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"default": "",
"format": "file-path",
"mimetype": "text/csv",
"pattern": "^\\S+\\.csv$"
}
}
},
"inputs" : {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type":
"string",
"description": "Type: `file`, required, example: `input.h5mu`. Input h5mu file",
"help_text": "Type: `file`, required, example: `input.h5mu`. Input h5mu file"
}
,
"modality": {
"type":
"string",
"description": "Type: `string`, default: `rna`. ",
"help_text": "Type: `string`, default: `rna`. "
,
"default":"rna"
}
,
"layer": {
"type":
"string",
"description": "Type: `string`, example: `raw_counts`. ",
"help_text": "Type: `string`, example: `raw_counts`. "
}
}
},
"outputs" : {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"output": {
"type":
"string",
"description": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Output h5mu file",
"help_text": "Type: `file`, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Output h5mu file."
,
"default":"$id.$key.output.h5mu"
}
,
"output_compression": {
"type":
"string",
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object",
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression format to be used on the output h5mu object.",
"enum": ["gzip", "lzf"]
}
}
},
"metrics added to .obs" : {
"title": "Metrics added to .obs",
"type": "object",
"description": "No description",
"properties": {
"var_qc_metrics": {
"type":
"string",
"description": "Type: List of `string`, example: `ercc,highly_variable,mitochondrial`, multiple_sep: `\";\"`. Keys to select a boolean (containing only True or False) column from ",
"help_text": "Type: List of `string`, example: `ercc,highly_variable,mitochondrial`, multiple_sep: `\";\"`. Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled \u0027True\u0027, \ncompared to the total sum of the values for all genes.\n"
}
,
"var_qc_metrics_fill_na_value": {
"type":
"boolean",
"description": "Type: `boolean`. Fill any \u0027NA\u0027 values found in the columns specified with --var_qc_metrics with \u0027True\u0027 or \u0027False\u0027",
"help_text": "Type: `boolean`. Fill any \u0027NA\u0027 values found in the columns specified with --var_qc_metrics with \u0027True\u0027 or \u0027False\u0027.\n"
}
,
"top_n_vars": {
"type":
"string",
"description": "Type: List of `integer`, multiple_sep: `\";\"`. Number of top vars to be used to calculate cumulative proportions",
"help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20;50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n"
}
,
"output_obs_num_nonzero_vars": {
"type":
"string",
"description": "Type: `string`, default: `num_nonzero_vars`. Name of column in ",
"help_text": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n"
,
"default":"num_nonzero_vars"
}
,
"output_obs_total_counts_vars": {
"type":
"string",
"description": "Type: `string`, default: `total_counts`. Name of the column for ",
"help_text": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n"
,
"default":"total_counts"
}
}
},
"metrics added to .var" : {
"title": "Metrics added to .var",
"type": "object",
"description": "No description",
"properties": {
"output_var_num_nonzero_obs": {
"type":
"string",
"description": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)",
"help_text": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n"
,
"default":"num_nonzero_obs"
}
,
"output_var_total_counts_obs": {
"type":
"string",
"description": "Type: `string`, default: `total_counts`. Name of the column in ",
"help_text": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n"
,
"default":"total_counts"
}
,
"output_var_obs_mean": {
"type":
"string",
"description": "Type: `string`, default: `obs_mean`. Name of the column in .var providing, for each feature, the mean of its values across all observations",
"help_text": "Type: `string`, default: `obs_mean`. Name of the column in .var providing, for each feature, the mean of its values across all observations.\n"
,
"default":"obs_mean"
}
,
"output_var_pct_dropout": {
"type":
"string",
"description": "Type: `string`, default: `pct_dropout`. Name of the column in .var providing for each feature the percentage of observations the feature does not appear on (i.e. is missing)",
"help_text": "Type: `string`, default: `pct_dropout`. Name of the column in .var providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n"
,
"default":"pct_dropout"
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/inputs"
},
{
"$ref": "#/definitions/outputs"
},
{
"$ref": "#/definitions/metrics added to .obs"
},
{
"$ref": "#/definitions/metrics added to .var"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to print INFO-level records to stdout.

    The root logger's level is set to INFO and a stream handler writing to
    ``sys.stdout`` is attached, using the format
    ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: if a stream handler bound to the current
    ``sys.stdout`` is already attached to the root logger, no second handler
    is added. Without this guard every additional call would duplicate each
    log line.

    Returns:
        logging.Logger: the root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Only attach a console handler when none is writing to this stdout yet.
    already_attached = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not already_attached:
        console_handler = logging.StreamHandler(stdout)
        logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(logFormatter)
        logger.addHandler(console_handler)

    return logger

View File

@@ -0,0 +1,406 @@
name: "qc"
namespace: "workflows/qc"
version: "2.1.2"
authors:
- name: "Dries Schaumont"
roles:
- "author"
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
arguments:
- type: "string"
name: "--id"
description: "ID of the sample."
info: null
example:
- "foo"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Path to the sample."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--modality"
description: "Which modality to process."
info: null
default:
- "rna"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--layer"
description: "Layer to calculate qc metrics for."
info: null
example:
- "raw_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Mitochondrial & Ribosomal Gene Detection"
arguments:
- type: "string"
name: "--var_gene_names"
description: ".var column name to be used to detect mitochondrial/ribosomal genes\
\ instead of .var_names (default if not set).\nGene names matching with the\
\ regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will\
\ be \nidentified as mitochondrial or ribosomal genes, respectively.\n"
info: null
example:
- "gene_symbol"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_name_mitochondrial_genes"
description: "In which .var slot to store a boolean array corresponding to the mitochondrial\
\ genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_name_mitochondrial_fraction"
description: ".obs slot to store the fraction of reads found to be mitochondrial.\
\ Defaults to 'fraction_' suffixed by the value of --var_name_mitochondrial_genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--mitochondrial_gene_regex"
description: "Regex string that identifies mitochondrial genes from --var_gene_names.\n\
By default will detect human and mouse mitochondrial genes from a gene symbol.\n"
info: null
default:
- "^[mM][tT]-"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--var_name_ribosomal_genes"
description: "In which .var slot to store a boolean array corresponding to the ribosomal\
\ genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--obs_name_ribosomal_fraction"
description: "When specified, write the fraction of counts originating from ribosomal\
\ genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified\
\ name.\nRequires --var_name_ribosomal_genes.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--ribosomal_gene_regex"
description: "Regex string that identifies ribosomal genes from --var_gene_names.\n\
By default will detect human and mouse ribosomal genes from a gene symbol.\n"
info: null
default:
- "^[Mm]?[Rr][Pp][LlSs]"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "QC metrics calculation options"
arguments:
- type: "string"
name: "--var_qc_metrics"
description: "Keys to select a boolean (containing only True or False) column\
\ from .var.\nFor each cell, calculate the proportion of total values for genes\
\ which are labeled 'True', \ncompared to the total sum of the values for all\
\ genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n"
info: null
example:
- "ercc,highly_variable"
required: false
direction: "input"
multiple: true
multiple_sep: ","
- type: "integer"
name: "--top_n_vars"
description: "Number of top vars to be used to calculate cumulative proportions.\n\
If not specified, proportions are not calculated. `--top_n_vars 20,50` finds\n\
cumulative proportion to the 20th and 50th most expressed vars.\n"
info: null
default:
- 50
- 100
- 200
- 500
required: false
direction: "input"
multiple: true
multiple_sep: ","
- type: "string"
name: "--output_obs_num_nonzero_vars"
description: "Name of column in .obs describing, for each observation, the number\
\ of stored values\n(including explicit zeroes). In other words, the name of\
\ the column that counts\nfor each row the number of columns that contain data.\n"
info: null
default:
- "num_nonzero_vars"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_obs_total_counts_vars"
description: "Name of the column for .obs describing, for each observation (row),\n\
the sum of the stored values in the columns.\n"
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_num_nonzero_obs"
description: "Name of column describing, for each feature, the number of stored\
\ values\n(including explicit zeroes). In other words, the name of the column\
\ that counts\nfor each column the number of rows that contain data.\n"
info: null
default:
- "num_nonzero_obs"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_total_counts_obs"
description: "Name of the column in .var describing, for each feature (column),\n\
the sum of the stored values in the rows.\n"
info: null
default:
- "total_counts"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_obs_mean"
description: "Name of the column in .obs providing the mean of the values in each\
\ row.\n"
info: null
default:
- "obs_mean"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--output_var_pct_dropout"
description: "Name of the column in .obs providing for each feature the percentage\
\ of\nobservations the feature does not appear on (i.e. is missing). Same as\
\ `--output_var_num_nonzero_obs`\nbut percentage based.\n"
info: null
default:
- "pct_dropout"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
description: "Destination path to the output."
info: null
example:
- "output.h5mu"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "nextflow_script"
path: "main.nf"
is_executable: true
entrypoint: "run_wf"
- type: "file"
path: "utils"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "A pipeline to add basic qc statistics to a MuData "
test_resources:
- type: "nextflow_script"
path: "test.nf"
is_executable: true
entrypoint: "test_wf"
- type: "file"
path: "concat_test_data"
- type: "file"
path: "pbmc_1k_protein_v3"
info:
test_dependencies:
- name: "qc_test"
namespace: "test_workflows/qc"
status: "enabled"
scope:
image: "public"
target: "public"
dependencies:
- name: "metadata/grep_annotation_column"
repository:
type: "local"
- name: "qc/calculate_qc_metrics"
repository:
type: "local"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "nextflow"
id: "nextflow"
directives:
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
build_info:
config: "src/workflows/qc/qc/config.vsh.yaml"
runner: "nextflow"
engine: "native"
output: "target/nextflow/workflows/qc/qc"
executable: "target/nextflow/workflows/qc/qc/main.nf"
viash_version: "0.9.4"
git_commit: "a0c9522486585774f76416150f8a3291409b5363"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "2.1.1-2-ga0c95224865"
dependencies:
- "target/nextflow/metadata/grep_annotation_column"
- "target/nextflow/qc/calculate_qc_metrics"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"2.1.2\""
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the workflows/qc/qc workflow.
manifest {
name = 'workflows/qc/qc'
mainScript = 'main.nf'
// The leading '!' asks Nextflow to stop (rather than only warn) when the
// running Nextflow version does not satisfy the constraint.
nextflowVersion = '!>=20.12.1-edge'
version = '2.1.2'
description = 'A pipeline to add basic qc statistics to a MuData '
author = 'Dries Schaumont'
}
// Container image applied to processes through the global process scope.
process.container = 'nextflow/bash:latest'
// detect tempdir
// Takes the first of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR that is set to a
// non-empty value (Elvis operator), falling back to '/tmp', and converts it
// to an absolute path. The result is consumed by the 'mount_temp' profile.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Optional configuration profiles.
// Each container-engine profile enables exactly one engine and explicitly
// disables the other four, so the choice stays unambiguous.
profiles {
// Disable publishDir for every process (the selector '.*' matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Point the container engines' temp setting at the detected tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Run with Docker only.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Run with Singularity only; autoMounts is switched on as well.
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Run with Podman only.
podman {
podman.enabled = true
docker.enabled = false
singularity.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
// Run with Shifter only.
shifter {
shifter.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
charliecloud.enabled = false
}
// Run with Charliecloud only.
charliecloud {
charliecloud.enabled = true
docker.enabled = false
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
}
}
// Resource labels: a process carrying one of these labels receives the
// matching memory or cpu setting.
//   mem<N>gb / mem<N>tb    decimal units, expressed in bytes (1 GB = 10^9 B)
//   mem<N>gib / mem<N>tib  binary units, expressed in bytes (1 GiB = 2^30 B)
//   cpu<N>                 number of CPUs
process{
withLabel: mem1gb { memory = 1000000000.B }
withLabel: mem2gb { memory = 2000000000.B }
withLabel: mem5gb { memory = 5000000000.B }
withLabel: mem10gb { memory = 10000000000.B }
withLabel: mem20gb { memory = 20000000000.B }
withLabel: mem50gb { memory = 50000000000.B }
withLabel: mem100gb { memory = 100000000000.B }
withLabel: mem200gb { memory = 200000000000.B }
withLabel: mem500gb { memory = 500000000000.B }
withLabel: mem1tb { memory = 1000000000000.B }
withLabel: mem2tb { memory = 2000000000000.B }
withLabel: mem5tb { memory = 5000000000000.B }
withLabel: mem10tb { memory = 10000000000000.B }
withLabel: mem20tb { memory = 20000000000000.B }
withLabel: mem50tb { memory = 50000000000000.B }
withLabel: mem100tb { memory = 100000000000000.B }
withLabel: mem200tb { memory = 200000000000000.B }
withLabel: mem500tb { memory = 500000000000000.B }
withLabel: mem1gib { memory = 1073741824.B }
withLabel: mem2gib { memory = 2147483648.B }
withLabel: mem4gib { memory = 4294967296.B }
withLabel: mem8gib { memory = 8589934592.B }
withLabel: mem16gib { memory = 17179869184.B }
withLabel: mem32gib { memory = 34359738368.B }
withLabel: mem64gib { memory = 68719476736.B }
withLabel: mem128gib { memory = 137438953472.B }
withLabel: mem256gib { memory = 274877906944.B }
withLabel: mem512gib { memory = 549755813888.B }
withLabel: mem1tib { memory = 1099511627776.B }
withLabel: mem2tib { memory = 2199023255552.B }
withLabel: mem4tib { memory = 4398046511104.B }
withLabel: mem8tib { memory = 8796093022208.B }
withLabel: mem16tib { memory = 17592186044416.B }
withLabel: mem32tib { memory = 35184372088832.B }
withLabel: mem64tib { memory = 70368744177664.B }
withLabel: mem128tib { memory = 140737488355328.B }
withLabel: mem256tib { memory = 281474976710656.B }
withLabel: mem512tib { memory = 562949953421312.B }
withLabel: cpu1 { cpus = 1 }
withLabel: cpu2 { cpus = 2 }
withLabel: cpu5 { cpus = 5 }
withLabel: cpu10 { cpus = 10 }
withLabel: cpu20 { cpus = 20 }
withLabel: cpu50 { cpus = 50 }
withLabel: cpu100 { cpus = 100 }
withLabel: cpu200 { cpus = 200 }
withLabel: cpu500 { cpus = 500 }
withLabel: cpu1000 { cpus = 1000 }
}
// Pull in the project-wide label definitions shipped next to this file.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
// Project-wide process defaults and the resource labels used by components.
process {
// Default resources for components that hardly do any processing
// (the memory request scales with the attempt number).
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit status 137 through 140); any other failure terminates the run.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound consulted by get_memory(); while it is null,
// get_memory() returns the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources: the request grows with task.attempt and is passed
// through get_memory() so that it can be capped by maxMemory.
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested memory amount at process.maxMemory, if one is configured.
//
// to_compare: the requested amount (the memory labels pass
//             '<N>.GB * task.attempt').
// Returns:
//   - to_compare unchanged when process.maxMemory is absent or unset;
//   - process.maxMemory when the current attempt number equals
//     process.maxRetries;
//   - process.maxMemory (as a MemoryUnit) when the request exceeds it;
//   - to_compare otherwise.
// If anything throws while evaluating the cap (e.g. an unparsable maxMemory
// or maxRetries), an explanatory message is printed and the JVM exits with
// status 1.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      // The attempt number equals maxRetries: go straight to the cap.
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Request exceeds the cap: clamp it. The cap must be read from
      // process.maxMemory; no other variable holding it exists in this scope.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,33 @@
# Inputs
id: # please fill in - example: "foo"
input: # please fill in - example: "input.h5mu"
modality: "rna"
# layer: "raw_counts"
# Mitochondrial & Ribosomal Gene Detection
# var_gene_names: "gene_symbol"
# var_name_mitochondrial_genes: "foo"
# obs_name_mitochondrial_fraction: "foo"
mitochondrial_gene_regex: "^[mM][tT]-"
# var_name_ribosomal_genes: "foo"
# obs_name_ribosomal_fraction: "foo"
ribosomal_gene_regex: "^[Mm]?[Rr][Pp][LlSs]"
# QC metrics calculation options
# var_qc_metrics: ["ercc,highly_variable"]
top_n_vars: [50, 100, 200, 500]
output_obs_num_nonzero_vars: "num_nonzero_vars"
output_obs_total_counts_vars: "total_counts"
output_var_num_nonzero_obs: "num_nonzero_obs"
output_var_total_counts_obs: "total_counts"
output_var_obs_mean: "obs_mean"
output_var_pct_dropout: "pct_dropout"
# Outputs
# output: "$id.$key.output.h5mu"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"
# Arguments

View File

@@ -0,0 +1,320 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "qc",
"description": "A pipeline to add basic qc statistics to a MuData ",
"type": "object",
"definitions": {
"Dataset input": {
"title": "Dataset input",
"type": "object",
"description": "Dataset input using nf-tower \"dataset\" or \"data explorer\". Allows for the input of multiple parameter sets to initialise a Nextflow channel.",
"properties": {
"param_list": {
"description": "Dataset input can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. The names of the input fields (e.g. csv columns, json keys) need to be an exact match with the workflow input parameters.",
"default": "",
"format": "file-path",
"mimetype": "text/csv",
"pattern": "^\\S+\\.csv$"
}
}
},
"inputs" : {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"id": {
"type":
"string",
"description": "Type: `string`, required, example: `foo`. ID of the sample",
"help_text": "Type: `string`, required, example: `foo`. ID of the sample."
}
,
"input": {
"type":
"string",
"description": "Type: `file`, required, example: `input.h5mu`. Path to the sample",
"help_text": "Type: `file`, required, example: `input.h5mu`. Path to the sample."
}
,
"modality": {
"type":
"string",
"description": "Type: `string`, default: `rna`. Which modality to process",
"help_text": "Type: `string`, default: `rna`. Which modality to process."
,
"default":"rna"
}
,
"layer": {
"type":
"string",
"description": "Type: `string`, example: `raw_counts`. Layer to calculate qc metrics for",
"help_text": "Type: `string`, example: `raw_counts`. Layer to calculate qc metrics for."
}
}
},
"outputs" : {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"output": {
"type":
"string",
"description": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Destination path to the output",
"help_text": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. Destination path to the output."
,
"default":"$id.$key.output.h5mu"
}
}
},
"mitochondrial & ribosomal gene detection" : {
"title": "Mitochondrial & Ribosomal Gene Detection",
"type": "object",
"description": "No description",
"properties": {
"var_gene_names": {
"type":
"string",
"description": "Type: `string`, example: `gene_symbol`. ",
"help_text": "Type: `string`, example: `gene_symbol`. .var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).\nGene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be \nidentified as mitochondrial or ribosomal genes, respectively.\n"
}
,
"var_name_mitochondrial_genes": {
"type":
"string",
"description": "Type: `string`. In which ",
"help_text": "Type: `string`. In which .var slot to store a boolean array corresponding to the mitochondrial genes.\n"
}
,
"obs_name_mitochondrial_fraction": {
"type":
"string",
"description": "Type: `string`. ",
"help_text": "Type: `string`. .obs slot to store the fraction of reads found to be mitochondrial. Defaults to \u0027fraction_\u0027 suffixed by the value of --var_name_mitochondrial_genes\n"
}
,
"mitochondrial_gene_regex": {
"type":
"string",
"description": "Type: `string`, default: `^[mM][tT]-`. Regex string that identifies mitochondrial genes from --var_gene_names",
"help_text": "Type: `string`, default: `^[mM][tT]-`. Regex string that identifies mitochondrial genes from --var_gene_names.\nBy default will detect human and mouse mitochondrial genes from a gene symbol.\n"
,
"default":"^[mM][tT]-"
}
,
"var_name_ribosomal_genes": {
"type":
"string",
"description": "Type: `string`. In which ",
"help_text": "Type: `string`. In which .var slot to store a boolean array corresponding to the ribosomal genes.\n"
}
,
"obs_name_ribosomal_fraction": {
"type":
"string",
"description": "Type: `string`. When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an ",
"help_text": "Type: `string`. When specified, write the fraction of counts originating from ribosomal genes \n(based on --ribosomal_gene_regex) to an .obs column with the specified name.\nRequires --var_name_ribosomal_genes.\n"
}
,
"ribosomal_gene_regex": {
"type":
"string",
"description": "Type: `string`, default: `^[Mm]?[Rr][Pp][LlSs]`. Regex string that identifies ribosomal genes from --var_gene_names",
"help_text": "Type: `string`, default: `^[Mm]?[Rr][Pp][LlSs]`. Regex string that identifies ribosomal genes from --var_gene_names.\nBy default will detect human and mouse ribosomal genes from a gene symbol.\n"
,
"default":"^[Mm]?[Rr][Pp][LlSs]"
}
}
},
"qc metrics calculation options" : {
"title": "QC metrics calculation options",
"type": "object",
"description": "No description",
"properties": {
"var_qc_metrics": {
"type":
"string",
"description": "Type: List of `string`, example: `ercc,highly_variable`, multiple_sep: `\",\"`. Keys to select a boolean (containing only True or False) column from ",
"help_text": "Type: List of `string`, example: `ercc,highly_variable`, multiple_sep: `\",\"`. Keys to select a boolean (containing only True or False) column from .var.\nFor each cell, calculate the proportion of total values for genes which are labeled \u0027True\u0027, \ncompared to the total sum of the values for all genes. Defaults to the value from\n--var_name_mitochondrial_genes.\n"
}
,
"top_n_vars": {
"type":
"string",
"description": "Type: List of `integer`, default: `50,100,200,500`, multiple_sep: `\",\"`. Number of top vars to be used to calculate cumulative proportions",
"help_text": "Type: List of `integer`, default: `50,100,200,500`, multiple_sep: `\",\"`. Number of top vars to be used to calculate cumulative proportions.\nIf not specified, proportions are not calculated. `--top_n_vars 20,50` finds\ncumulative proportion to the 20th and 50th most expressed vars.\n"
,
"default":"50,100,200,500"
}
,
"output_obs_num_nonzero_vars": {
"type":
"string",
"description": "Type: `string`, default: `num_nonzero_vars`. Name of column in ",
"help_text": "Type: `string`, default: `num_nonzero_vars`. Name of column in .obs describing, for each observation, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each row the number of columns that contain data.\n"
,
"default":"num_nonzero_vars"
}
,
"output_obs_total_counts_vars": {
"type":
"string",
"description": "Type: `string`, default: `total_counts`. Name of the column for ",
"help_text": "Type: `string`, default: `total_counts`. Name of the column for .obs describing, for each observation (row),\nthe sum of the stored values in the columns.\n"
,
"default":"total_counts"
}
,
"output_var_num_nonzero_obs": {
"type":
"string",
"description": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes)",
"help_text": "Type: `string`, default: `num_nonzero_obs`. Name of column describing, for each feature, the number of stored values\n(including explicit zeroes). In other words, the name of the column that counts\nfor each column the number of rows that contain data.\n"
,
"default":"num_nonzero_obs"
}
,
"output_var_total_counts_obs": {
"type":
"string",
"description": "Type: `string`, default: `total_counts`. Name of the column in ",
"help_text": "Type: `string`, default: `total_counts`. Name of the column in .var describing, for each feature (column),\nthe sum of the stored values in the rows.\n"
,
"default":"total_counts"
}
,
"output_var_obs_mean": {
"type":
"string",
"description": "Type: `string`, default: `obs_mean`. Name of the column in ",
"help_text": "Type: `string`, default: `obs_mean`. Name of the column in .obs providing the mean of the values in each row.\n"
,
"default":"obs_mean"
}
,
"output_var_pct_dropout": {
"type":
"string",
"description": "Type: `string`, default: `pct_dropout`. Name of the column in ",
"help_text": "Type: `string`, default: `pct_dropout`. Name of the column in .obs providing for each feature the percentage of\nobservations the feature does not appear on (i.e. is missing). Same as `--output_var_num_nonzero_obs`\nbut percentage based.\n"
,
"default":"pct_dropout"
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/inputs"
},
{
"$ref": "#/definitions/outputs"
},
{
"$ref": "#/definitions/mitochondrial & ribosomal gene detection"
},
{
"$ref": "#/definitions/qc metrics calculation options"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}

View File

@@ -0,0 +1,36 @@
// Profiles for this configuration: temp-dir mounting, publishing switch and
// the Docker engine selection.
profiles {
// detect tempdir
// Takes the first of NXF_TEMP, VIASH_TEMP, TEMPDIR and TMPDIR that is set to a
// non-empty value, falling back to '/tmp', as an absolute path.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Point the container engines' temp setting at the detected tempDir.
mount_temp {
docker.temp = tempDir
podman.temp = tempDir
charliecloud.temp = tempDir
}
// Disable publishDir for every process (the selector '.*' matches all names).
no_publish {
process {
withName: '.*' {
publishDir = [
enabled: false
]
}
}
}
// Run with Docker only; every other container engine is switched off.
docker {
docker.enabled = true
// docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
charliecloud.enabled = false
}
}

View File

@@ -0,0 +1,66 @@
// Project-wide process defaults and the resource labels used by components.
process {
// Default resources for components that hardly do any processing
// (the memory request scales with the attempt number).
memory = { 2.GB * task.attempt }
cpus = 1
// Retry for exit codes that have something to do with memory issues
// (exit status 137 through 140); any other failure terminates the run.
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries = 3
// Optional upper bound consulted by get_memory(); while it is null,
// get_memory() returns the requested amount unchanged.
maxMemory = null
// CPU resources
withLabel: singlecpu { cpus = 1 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 10 }
withLabel: highcpu { cpus = 20 }
// Memory resources: the request grows with task.attempt and is passed
// through get_memory() so that it can be capped by maxMemory.
withLabel: lowmem { memory = { get_memory( 4.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 25.GB * task.attempt ) } }
withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
// Disk space
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
// NOTE: The above labels intentionally do not have an effect by default.
// The user should set the disk space requirements by adding the following
// to the compute environment:
//
// withLabel: lowdisk { disk = { 20.GB * task.attempt } }
// withLabel: middisk { disk = { 100.GB * task.attempt } }
// withLabel: highdisk { disk = { 200.GB * task.attempt } }
// withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested memory amount at process.maxMemory, if one is configured.
//
// to_compare: the requested amount (the memory labels pass
//             '<N>.GB * task.attempt').
// Returns:
//   - to_compare unchanged when process.maxMemory is absent or unset;
//   - process.maxMemory when the current attempt number equals
//     process.maxRetries;
//   - process.maxMemory (as a MemoryUnit) when the request exceeds it;
//   - to_compare otherwise.
// If anything throws while evaluating the cap (e.g. an unparsable maxMemory
// or maxRetries), an explanatory message is printed and the JVM exits with
// status 1.
def get_memory(to_compare) {
  // No cap configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      // The attempt number equals maxRetries: go straight to the cap.
      return process.maxMemory
    }
    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
      // Request exceeds the cap: clamp it. The cap must be read from
      // process.maxMemory; no other variable holding it exists in this scope.
      return process.maxMemory as nextflow.util.MemoryUnit
    }
    else {
      return to_compare
    }
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

View File

@@ -0,0 +1,33 @@
// Resource overrides: every memory label is pinned to the same 13 Gb and the
// cpu labels shown here to 4 CPUs, regardless of the label's nominal size.
// NOTE(review): presumably sized for a small CI/test runner; confirm.
process {
withLabel: lowmem { memory = 13.Gb }
withLabel: lowcpu { cpus = 4 }
withLabel: midmem { memory = 13.Gb }
withLabel: midcpu { cpus = 4 }
withLabel: highmem { memory = 13.Gb }
withLabel: highcpu { cpus = 4 }
withLabel: veryhighmem { memory = 13.Gb }
// Disk labels keep whatever process.disk is set to, or null when unset.
withLabel: lowdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: middisk {
disk = {process.disk ? process.disk : null}
}
withLabel: highdisk {
disk = {process.disk ? process.disk : null}
}
withLabel: veryhighdisk {
disk = {process.disk ? process.disk : null}
}
}
// Environment variable exported to tasks: numba's cache directory is set to /tmp.
env.NUMBA_CACHE_DIR = '/tmp'
// Write an execution trace, replacing any trace file from a previous run.
trace {
enabled = true
overwrite = true
}
// Allow an existing DAG report to be overwritten.
dag {
overwrite = true
}
// Run at most one instance of each process at a time.
process.maxForks = 1

View File

@@ -0,0 +1,224 @@
name: "split_modalities"
namespace: "workflows/multiomics"
version: "disable-scrublet_build"
authors:
- name: "Dries Schaumont"
roles:
- "author"
- "maintainer"
info:
role: "Core Team Member"
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
arguments:
- type: "string"
name: "--id"
description: "ID of the sample."
info: null
example:
- "foo"
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--input"
alternatives:
- "-i"
description: "Path to the sample."
info: null
example:
- "input.h5mu"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
arguments:
- type: "file"
name: "--output"
alternatives:
- "-o"
description: "Output directory containing multiple h5mu files."
info: null
example:
- "/path/to/output"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--output_types"
description: "A csv containing the base filename and modality type per output\
\ file."
info: null
example:
- "types.csv"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "nextflow_script"
path: "main.nf"
is_executable: true
entrypoint: "run_wf"
- type: "file"
path: "utils"
- type: "file"
path: "nextflow_labels.config"
dest: "nextflow_labels.config"
description: "A pipeline to split a multimodal mudata file into several unimodal\
\ mudata files."
test_resources:
- type: "nextflow_script"
path: "test.nf"
is_executable: true
entrypoint: "test_wf"
- type: "file"
path: "pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
info:
test_dependencies:
- name: "split_modalities_test"
namespace: "test_workflows/multiomics"
status: "enabled"
scope:
image: "private"
target: "private"
dependencies:
- name: "dataflow/split_modalities"
alias: "split_modalities_component"
repository:
type: "local"
license: "MIT"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
runners:
- type: "nextflow"
id: "nextflow"
directives:
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
script:
- "includeConfig(\"nextflow_labels.config\")"
debug: false
container: "docker"
build_info:
config: "src/workflows/multiomics/split_modalities/config.vsh.yaml"
runner: "nextflow"
engine: "native"
output: "target/_private/nextflow/workflows/multiomics/split_modalities"
executable: "target/_private/nextflow/workflows/multiomics/split_modalities/main.nf"
viash_version: "0.9.4"
git_commit: "07297b53180b28c8e198414328683e941eec7ed0"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "0.2.0-2044-g07297b53180"
dependencies:
- "target/nextflow/dataflow/split_modalities"
package_config:
name: "openpipeline"
summary: "Best-practice workflows for single-cell multi-omics analyses.\n"
description: "OpenPipelines are extensible single cell analysis pipelines for reproducible\
\ and large-scale single cell processing using [Viash](https://viash.io) and [Nextflow](https://www.nextflow.io/).\n\
\nIn terms of workflows, the following has been made available, but keep in mind\
\ that\nindividual tools and functionality can be executed as standalone components\
\ as well.\n\n * Demultiplexing: conversion of raw sequencing data to FASTQ objects.\n\
\ * Ingestion: Read mapping and generating a count matrix.\n * Single sample\
\ processing: cell filtering and doublet detection.\n * Multisample processing:\
\ Count transformation, normalization, QC metric calculations.\n * Integration:\
\ Clustering, integration and batch correction using single and multimodal methods.\n\
\ * Downstream analysis workflows\n"
info:
test_resources:
- type: "s3"
path: "s3://openpipelines-data"
dest: "resources_test"
viash_version: "0.9.4"
source: "src"
target: "target"
config_mods:
- ".resources += {path: '/src/workflows/utils/labels.config', dest: 'nextflow_labels.config'}\n\
.runners[.type == 'nextflow'].config.script := 'includeConfig(\"nextflow_labels.config\"\
)'"
- ".version := \"disable-scrublet_build\""
keywords:
- "single-cell"
- "multimodal"
license: "MIT"
organization: "openpipelines-bio"
links:
repository: "https://github.com/openpipelines-bio/openpipeline"
docker_registry: "ghcr.io"
homepage: "https://openpipelines.bio"
documentation: "https://openpipelines.bio/fundamentals"
issue_tracker: "https://github.com/openpipelines-bio/openpipeline/issues"

View File

@@ -0,0 +1,126 @@
// Pipeline metadata for the split_modalities workflow.
manifest {
  // Identity
  name = 'workflows/multiomics/split_modalities'
  version = 'disable-scrublet_build'
  description = 'A pipeline to split a multimodal mudata files into several unimodal mudata files.'
  author = 'Dries Schaumont'

  // Entry point and required Nextflow version
  mainScript = 'main.nf'
  nextflowVersion = '!>=20.12.1-edge'
}
// Container image set at the process scope.
process.container = 'nextflow/bash:latest'
// Detect the temporary directory: take the first of NXF_TEMP, VIASH_TEMP,
// TEMPDIR and TMPDIR that is set to a non-empty value, fall back to '/tmp',
// and store the result as an absolute path. The `mount_temp` profile below
// assigns this value to the container engines' `temp` setting.
tempDir = java.nio.file.Paths.get(
System.getenv('NXF_TEMP') ?:
System.getenv('VIASH_TEMP') ?:
System.getenv('TEMPDIR') ?:
System.getenv('TMPDIR') ?:
'/tmp'
).toAbsolutePath()
// Optional execution profiles.
profiles {
  // Switch off publishDir for every process name.
  no_publish {
    process {
      withName: '.*' {
        publishDir = [enabled: false]
      }
    }
  }

  // Point the temp setting of the container engines at the detected tempDir.
  mount_temp {
    docker.temp = tempDir
    podman.temp = tempDir
    charliecloud.temp = tempDir
  }

  // Container engine profiles: each one enables a single engine and
  // explicitly disables the other four.
  docker {
    docker.enabled = true
    // docker.userEmulation = true
    singularity.enabled = false
    podman.enabled = false
    shifter.enabled = false
    charliecloud.enabled = false
  }

  singularity {
    singularity.enabled = true
    singularity.autoMounts = true
    docker.enabled = false
    podman.enabled = false
    shifter.enabled = false
    charliecloud.enabled = false
  }

  podman {
    podman.enabled = true
    docker.enabled = false
    singularity.enabled = false
    shifter.enabled = false
    charliecloud.enabled = false
  }

  shifter {
    shifter.enabled = true
    docker.enabled = false
    singularity.enabled = false
    podman.enabled = false
    charliecloud.enabled = false
  }

  charliecloud {
    charliecloud.enabled = true
    docker.enabled = false
    singularity.enabled = false
    podman.enabled = false
    shifter.enabled = false
  }
}
// Label-to-resource mapping. Memory values are written as exact byte counts.
process {
  // Memory labels, decimal sizes (powers of 1000)
  withLabel: mem1gb   { memory = 1000000000.B }
  withLabel: mem2gb   { memory = 2000000000.B }
  withLabel: mem5gb   { memory = 5000000000.B }
  withLabel: mem10gb  { memory = 10000000000.B }
  withLabel: mem20gb  { memory = 20000000000.B }
  withLabel: mem50gb  { memory = 50000000000.B }
  withLabel: mem100gb { memory = 100000000000.B }
  withLabel: mem200gb { memory = 200000000000.B }
  withLabel: mem500gb { memory = 500000000000.B }
  withLabel: mem1tb   { memory = 1000000000000.B }
  withLabel: mem2tb   { memory = 2000000000000.B }
  withLabel: mem5tb   { memory = 5000000000000.B }
  withLabel: mem10tb  { memory = 10000000000000.B }
  withLabel: mem20tb  { memory = 20000000000000.B }
  withLabel: mem50tb  { memory = 50000000000000.B }
  withLabel: mem100tb { memory = 100000000000000.B }
  withLabel: mem200tb { memory = 200000000000000.B }
  withLabel: mem500tb { memory = 500000000000000.B }

  // Memory labels, binary sizes (powers of 1024)
  withLabel: mem1gib   { memory = 1073741824.B }
  withLabel: mem2gib   { memory = 2147483648.B }
  withLabel: mem4gib   { memory = 4294967296.B }
  withLabel: mem8gib   { memory = 8589934592.B }
  withLabel: mem16gib  { memory = 17179869184.B }
  withLabel: mem32gib  { memory = 34359738368.B }
  withLabel: mem64gib  { memory = 68719476736.B }
  withLabel: mem128gib { memory = 137438953472.B }
  withLabel: mem256gib { memory = 274877906944.B }
  withLabel: mem512gib { memory = 549755813888.B }
  withLabel: mem1tib   { memory = 1099511627776.B }
  withLabel: mem2tib   { memory = 2199023255552.B }
  withLabel: mem4tib   { memory = 4398046511104.B }
  withLabel: mem8tib   { memory = 8796093022208.B }
  withLabel: mem16tib  { memory = 17592186044416.B }
  withLabel: mem32tib  { memory = 35184372088832.B }
  withLabel: mem64tib  { memory = 70368744177664.B }
  withLabel: mem128tib { memory = 140737488355328.B }
  withLabel: mem256tib { memory = 281474976710656.B }
  withLabel: mem512tib { memory = 562949953421312.B }

  // CPU labels
  withLabel: cpu1    { cpus = 1 }
  withLabel: cpu2    { cpus = 2 }
  withLabel: cpu5    { cpus = 5 }
  withLabel: cpu10   { cpus = 10 }
  withLabel: cpu20   { cpus = 20 }
  withLabel: cpu50   { cpus = 50 }
  withLabel: cpu100  { cpus = 100 }
  withLabel: cpu200  { cpus = 200 }
  withLabel: cpu500  { cpus = 500 }
  withLabel: cpu1000 { cpus = 1000 }
}

// Load the packaged label configuration after the mapping above.
includeConfig("nextflow_labels.config")

View File

@@ -0,0 +1,66 @@
process {
  // Baseline resources for components that do very little processing.
  memory = { 2.GB * task.attempt }
  cpus = 1

  // Retry only on exit codes 137 through 140 (memory-related failures);
  // any other exit status terminates the run.
  errorStrategy = { (task.exitStatus in 137..140) ? 'retry' : 'terminate' }
  maxRetries = 3
  maxMemory = null

  // CPU labels
  withLabel: singlecpu { cpus = 1 }
  withLabel: lowcpu    { cpus = 4 }
  withLabel: midcpu    { cpus = 10 }
  withLabel: highcpu   { cpus = 20 }

  // Memory labels: the request scales with the attempt number and is passed
  // through get_memory().
  withLabel: lowmem      { memory = { get_memory( 4.GB * task.attempt ) } }
  withLabel: midmem      { memory = { get_memory( 25.GB * task.attempt ) } }
  withLabel: highmem     { memory = { get_memory( 50.GB * task.attempt ) } }
  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }

  // Disk labels: pass through process.disk when it is set, otherwise null.
  withLabel: lowdisk      { disk = { process.disk ?: null } }
  withLabel: middisk      { disk = { process.disk ?: null } }
  withLabel: highdisk     { disk = { process.disk ?: null } }
  withLabel: veryhighdisk { disk = { process.disk ?: null } }

  // NOTE: The disk labels above intentionally have no effect by default.
  // Set the disk space requirements by adding the following to the
  // compute environment:
  //
  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
  // withLabel: middisk { disk = { 100.GB * task.attempt } }
  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
}
// Cap a requested memory amount at the user-configurable `process.maxMemory`.
//
// to_compare: the memory amount a label would like to request; it must support
//             `compareTo` against a nextflow.util.MemoryUnit.
// Returns:    `to_compare` when no ceiling is configured or the request does
//             not exceed it; otherwise the configured ceiling.
// On any error while evaluating the settings, a message is printed and the
// JVM exits with status 1.
def get_memory(to_compare) {
  // No ceiling configured: hand the request back untouched.
  if (!process.containsKey("maxMemory") || !process.maxMemory) {
    return to_compare
  }

  try {
    // When the attempt number equals maxRetries, request the ceiling directly.
    // NOTE(review): `task` is not passed in as an argument; confirm that it
    // resolves in the scope this function is called from.
    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
      return process.maxMemory
    }

    def memoryCeiling = process.maxMemory as nextflow.util.MemoryUnit
    // compareTo only guarantees the sign of its result, so test `> 0`
    // instead of `== 1`.
    if (to_compare.compareTo(memoryCeiling) > 0) {
      // The original returned the undefined name `max_memory` here, which
      // sent every over-limit request into the catch block below.
      return memoryCeiling
    }
    return to_compare
  } catch (all) {
    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
    System.exit(1)
  }
}

Some files were not shown because too many files have changed in this diff Show More