Files
openpipeline/src/annotate/celltypist/config.vsh.yaml
2026-06-09 18:32:44 +00:00

166 lines
6.0 KiB
YAML

name: celltypist
namespace: annotate
scope: "public"
description: Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]
argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
default: log_normalized
description: The layer in the input data containing counts that are lognormalized to 10000, .X is not to be used.
- name: "--input_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the input data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: "--input_reference_gene_overlap"
type: integer
default: 100
min: 1
description: |
The minimum number of genes present in both the reference and query datasets.
- name: "--sanitize_ensembl_ids"
type: boolean
description: Whether to sanitize ensembl ids by removing version numbers.
default: true
- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data containing counts that are lognormalized to 10000, if .X is not to be used.
required: false
- name: "--reference_obs_target"
type: string
description: The name of the adata obs column in the reference data containing cell type annotations.
default: "cell_ontology_class"
- name: "--reference_var_input"
type: string
default: "filter_with_hvg"
required: false
description: |
.var column containing highly variable genes. If not provided, genes will not be subset.
- name: "--reference_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
- name: Model arguments
description: Model arguments.
arguments:
- name: "--model"
type: file
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
required: false
example: pretrained_model.pkl
- name: "--feature_selection"
type: boolean
description: "Whether to perform feature selection."
default: false
- name: "--majority_voting"
type: boolean
description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
default: false
- name: "--C"
type: double
description: "Inverse of regularization strength in logistic regression."
default: 1.0
- name: "--max_iter"
type: integer
description: "Maximum number of iterations before reaching the minimum of the cost function."
default: 1000
- name: "--use_SGD"
type: boolean_true
description: "Whether to use the stochastic gradient descent algorithm."
- name: "--min_prop"
type: double
description: |
"For the dominant cell type within a subcluster, the minimum proportion of cells required to
support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
Subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'."
default: 0
- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obs_predictions"
type: string
default: celltypist_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: celltypist_probability
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
__merge__: [., /src/base/h5_compression_argument.yaml]
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/cross_check_genes.py
- path: /src/utils/subset_vars.py
- path: /src/utils/set_var_index.py
- path: /src/utils/is_lognormalized.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
engines:
- type: docker
image: nvcr.io/nvidia/pytorch:25.11-py3
setup:
- type: python
packages:
- celltypist==1.7.1
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
test_setup:
- type: python
__merge__: [ /src/base/requirements/scanpy.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [highcpu, highmem, highdisk, gpu]