Build pipeline: openpipelines-bio.openpipeline.processing-workflows-quantijd49m
Source commit: 38f5f83d11
Source message: fix tests
166 lines
6.0 KiB
YAML
166 lines
6.0 KiB
YAML
# Component identity: the name of the component and the namespace it lives in.
name: celltypist
namespace: annotate
scope: "public"
# Folded scalar (`>-`): the wrapped lines below are joined with single spaces
# and the trailing newline is stripped, giving one single-line description.
description: >-
  Automated cell type annotation tool for scRNA-seq datasets on the basis of
  logistic regression classifiers optimised by the stochastic gradient descent
  algorithm.
# Each entry merges a shared author record and adds this component's role.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
  roles: [author]
- __merge__: /src/authors/weiwei_schultz.yaml
  roles: [contributor]

argument_groups:
# Arguments describing the query dataset that is to be annotated.
- name: Inputs
  description: Input dataset (query) arguments
  arguments:
  - name: "--input"
    alternatives: [-i]
    type: file
    description: The input (query) data to be labeled. Should be a .h5mu file.
    direction: input
    required: true
    example: input.h5mu
  - name: "--modality"
    description: Which modality to process.
    type: string
    default: "rna"
    required: false
  - name: "--input_layer"
    type: string
    default: log_normalized
    # NOTE(review): the matching --reference_layer text reads "if .X is not to
    # be used"; confirm whether the "if" was dropped here or whether .X is
    # deliberately unsupported for the query data.
    description: The layer in the input data containing counts that are lognormalized to 10000, .X is not to be used.
  - name: "--input_var_gene_names"
    type: string
    required: false
    # The fallback is described in terms of this argument itself; the previous
    # wording pointed at a `gene_name_layer` option that this config does not
    # define.
    description: |
      The name of the adata var column in the input data containing gene names. If not provided, the var index will be used.
  - name: "--input_reference_gene_overlap"
    type: integer
    default: 100
    min: 1
    description: |
      The minimum number of genes present in both the reference and query datasets.
  - name: "--sanitize_ensembl_ids"
    type: boolean
    description: Whether to sanitize ensembl ids by removing version numbers.
    default: true

# Arguments describing the labelled reference dataset used for training.
- name: Reference
  description: Arguments related to the reference dataset.
  arguments:
  - name: "--reference"
    type: file
    description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
    example: reference.h5mu
    direction: input
    required: false
  - name: "--reference_layer"
    type: string
    description: The layer in the reference data containing counts that are lognormalized to 10000, if .X is not to be used.
    required: false
  - name: "--reference_obs_target"
    type: string
    description: The name of the adata obs column in the reference data containing cell type annotations.
    default: "cell_ontology_class"
  - name: "--reference_var_input"
    type: string
    default: "filter_with_hvg"
    required: false
    # NOTE(review): a default is declared above, so the "If not provided" case
    # presumably only applies when the default is explicitly cleared; confirm.
    description: |
      .var column containing highly variable genes. If not provided, genes will not be subset.
  - name: "--reference_var_gene_names"
    type: string
    required: false
    # The fallback is described in terms of this argument itself; the previous
    # wording pointed at a `gene_name_layer` option that this config does not
    # define.
    description: |
      The name of the adata var column in the reference data containing gene names. If not provided, the var index will be used.

# Arguments controlling the classifier: pre-trained model, training and voting.
- name: Model arguments
  description: Model arguments.
  arguments:
  - name: "--model"
    type: file
    description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
    required: false
    example: pretrained_model.pkl
  - name: "--feature_selection"
    type: boolean
    description: "Whether to perform feature selection."
    default: false
  - name: "--majority_voting"
    type: boolean
    description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
    default: false
  - name: "--C"
    type: double
    description: "Inverse of regularization strength in logistic regression."
    default: 1.0
  - name: "--max_iter"
    type: integer
    description: "Maximum number of iterations before reaching the minimum of the cost function."
    default: 1000
  - name: "--use_SGD"
    type: boolean_true
    description: "Whether to use the stochastic gradient descent algorithm."
  - name: "--min_prop"
    type: double
    # Literal block scalar: the text must NOT be wrapped in double quotes,
    # otherwise the quote characters become part of the rendered description.
    description: |
      For the dominant cell type within a subcluster, the minimum proportion of cells required to
      support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
      Subclusters that fail to pass this proportion threshold will be assigned 'Heterogeneous'.
    # Written as a float literal to match the declared `double` type.
    default: 0.0

# Arguments describing where the annotated data and predictions are written.
- name: Outputs
  description: Output arguments.
  arguments:
  - name: "--output"
    type: file
    description: Output h5mu file.
    direction: output
    example: output.h5mu
  - name: "--output_obs_predictions"
    type: string
    default: celltypist_pred
    required: false
    description: |
      In which `.obs` slots to store the predicted information.
  - name: "--output_obs_probability"
    type: string
    default: celltypist_probability
    required: false
    description: |
      In which `.obs` slots to store the probability of the predictions.
  # NOTE(review): nesting reconstructed. This merge is placed at group level,
  # presumably so the shared file can contribute an extra argument; confirm.
  __merge__: [., /src/base/h5_compression_argument.yaml]

# The component's Python script followed by utility files from /src/utils.
resources:
- type: python_script
  path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/cross_check_genes.py
- path: /src/utils/subset_vars.py
- path: /src/utils/set_var_index.py
- path: /src/utils/is_lognormalized.py

# The test script and the data directories listed alongside it.
test_resources:
- type: python_script
  path: test.py
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/

# Docker engine definition: base image, setup steps and test-only setup.
engines:
- type: docker
  image: nvcr.io/nvidia/pytorch:25.11-py3
  setup:
  # CellTypist itself, pinned to an exact version.
  - type: python
    packages:
    - celltypist==1.7.1
  # Second python step merges a shared requirements file.
  - type: python
    __merge__: [/src/base/requirements/anndata_mudata.yaml, .]
  test_setup:
  - type: python
    __merge__: [/src/base/requirements/scanpy.yaml, .]
  # NOTE(review): nesting reconstructed. This merge sits at engine level because
  # a second `__merge__` inside the test_setup entry above would be a duplicate
  # key; confirm against the original layout.
  __merge__: [/src/base/requirements/python_test_setup.yaml, .]
# Runners: a standalone executable and a Nextflow module with resource labels.
runners:
- type: executable
- type: nextflow
  directives:
    label: [highcpu, highmem, highdisk, gpu]