# Component configuration for the `celltypist` cell type annotation component.
# `__merge__` keys reference other YAML files by path; their contents are not
# visible here (presumably merged in by the build tool - confirm).
name: celltypist
namespace: annotate
scope: "public"
description: Automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm.

authors:
  - __merge__: /src/authors/jakub_majercik.yaml
    roles: [ author ]
  - __merge__: /src/authors/weiwei_schultz.yaml
    roles: [ contributor ]

argument_groups:
  # Query dataset to be annotated.
  - name: Inputs
    description: Input dataset (query) arguments
    arguments:
      - name: "--input"
        alternatives: [-i]
        type: file
        description: The input (query) data to be labeled. Should be a .h5mu file.
        direction: input
        required: true
        example: input.h5mu
      - name: "--modality"
        description: Which modality to process.
        type: string
        default: "rna"
        required: false
      - name: "--input_layer"
        type: string
        default: log_normalized
        description: The layer in the input data containing counts that are lognormalized to 10000, .X is not to be used.
      - name: "--input_var_gene_names"
        type: string
        required: false
        description: |
          The name of the adata var column in the input data containing gene names; when not provided, the var index will be used.
      - name: "--input_reference_gene_overlap"
        type: integer
        default: 100
        min: 1
        description: |
          The minimum number of genes present in both the reference and query datasets.
      - name: "--sanitize_ensembl_ids"
        type: boolean
        description: Whether to sanitize ensembl ids by removing version numbers.
        default: true

  # Reference dataset; only needed when no pre-trained --model is supplied.
  - name: Reference
    description: Arguments related to the reference dataset.
    arguments:
      - name: "--reference"
        type: file
        description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
        example: reference.h5mu
        direction: input
        required: false
      - name: "--reference_layer"
        type: string
        description: The layer in the reference data containing counts that are lognormalized to 10000, if .X is not to be used.
        required: false
      - name: "--reference_obs_target"
        type: string
        description: The name of the adata obs column in the reference data containing cell type annotations.
        default: "cell_ontology_class"
      - name: "--reference_var_input"
        type: string
        default: "filter_with_hvg"
        required: false
        description: |
          .var column containing highly variable genes. If not provided, genes will not be subset.
      - name: "--reference_var_gene_names"
        type: string
        required: false
        description: |
          The name of the adata var column in the reference data containing gene names; when not provided, the var index will be used.

  # Classifier training and prediction settings.
  - name: Model arguments
    description: Model arguments.
    arguments:
      - name: "--model"
        type: file
        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
        required: false
        example: pretrained_model.pkl
      - name: "--feature_selection"
        type: boolean
        description: "Whether to perform feature selection."
        default: false
      - name: "--majority_voting"
        type: boolean
        description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
        default: false
      - name: "--C"
        type: double
        description: "Inverse of regularization strength in logistic regression."
        default: 1.0
      - name: "--max_iter"
        type: integer
        description: "Maximum number of iterations before reaching the minimum of the cost function."
        default: 1000
      - name: "--use_SGD"
        type: boolean_true
        description: "Whether to use the stochastic gradient descent algorithm."
      - name: "--min_prop"
        type: double
        # Literal block scalar: the text must not be wrapped in quotes, or the
        # quote characters become part of the description.
        description: |
          For the dominant cell type within a subcluster, the minimum proportion of cells required to support naming of the subcluster by this cell type.
          Ignored if majority_voting is set to False.
          A subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'.
        default: 0

  # Output file and the .obs slots that receive the predictions.
  - name: Outputs
    description: Output arguments.
    arguments:
      - name: "--output"
        type: file
        description: Output h5mu file.
        direction: output
        example: output.h5mu
      - name: "--output_obs_predictions"
        type: string
        default: celltypist_pred
        required: false
        description: |
          In which `.obs` slots to store the predicted information.
      - name: "--output_obs_probability"
        type: string
        default: celltypist_probability
        required: false
        description: |
          In which `.obs` slots to store the probability of the predictions.
    __merge__: [., /src/base/h5_compression_argument.yaml]

resources:
  - type: python_script
    path: script.py
  - path: /src/utils/setup_logger.py
  - path: /src/utils/cross_check_genes.py
  - path: /src/utils/subset_vars.py
  - path: /src/utils/set_var_index.py
  - path: /src/utils/is_lognormalized.py

test_resources:
  - type: python_script
    path: test.py
  - path: /resources_test/annotation_test_data/
  - path: /resources_test/pbmc_1k_protein_v3/

engines:
  - type: docker
    image: nvcr.io/nvidia/pytorch:25.11-py3
    setup:
      - type: python
        packages:
          - celltypist==1.7.1
      - type: python
        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
    test_setup:
      - type: python
        __merge__: [ /src/base/requirements/scanpy.yaml, .]
    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [highcpu, highmem, highdisk, gpu]