# Component identity: `celltypist` within the `annotate` namespace.
name: celltypist
namespace: annotate
scope: public
description: >-
  Automated cell type annotation tool for scRNA-seq datasets on the basis of
  logistic regression classifiers optimised by the stochastic gradient descent
  algorithm.
# Each author entry merges in a shared author file and adds the role(s) held
# for this component.
authors:
  - __merge__: /src/authors/jakub_majercik.yaml
    roles:
      - author
  - __merge__: /src/authors/weiwei_schultz.yaml
    roles:
      - contributor

argument_groups:
  # Arguments describing the query dataset that is to be labeled.
  - name: Inputs
    description: Input dataset (query) arguments
    arguments:
      - name: "--input"
        alternatives: [-i]
        type: file
        description: The input (query) data to be labeled. Should be a .h5mu file.
        direction: input
        required: true
        example: input.h5mu
      - name: "--modality"
        description: Which modality to process.
        type: string
        default: "rna"
        required: false
      - name: "--input_layer"
        type: string
        default: log_normalized
        description: The layer in the input data containing counts that are lognormalized to 10000, .X is not to be used.
      # Optional: the description refers to this argument itself; there is no
      # separate `gene_name_layer` argument in this component.
      - name: "--input_var_gene_names"
        type: string
        required: false
        description: |
          The name of the adata var column in the input data containing gene names; when not provided, the var index will be used.
      - name: "--input_reference_gene_overlap"
        type: integer
        default: 100
        min: 1
        description: |
          The minimum number of genes present in both the reference and query datasets.
      - name: "--sanitize_ensembl_ids"
        type: boolean
        description: Whether to sanitize ensembl ids by removing version numbers.
        default: true

  # Arguments describing the labeled reference dataset used for training.
  # `--reference` is optional; its description ties it to `--model`.
  - name: Reference
    description: Arguments related to the reference dataset.
    arguments:
      - name: "--reference"
        type: file
        description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
        example: reference.h5mu
        direction: input
        required: false
      - name: "--reference_layer"
        type: string
        description: The layer in the reference data containing counts that are lognormalized to 10000, if .X is not to be used.
        required: false
      - name: "--reference_obs_target"
        type: string
        description: The name of the adata obs column in the reference data containing cell type annotations.
        default: "cell_ontology_class"
      # NOTE(review): a default is declared, so this argument has a value unless
      # the caller explicitly clears it. The earlier help text ("If not
      # provided, genes will not be subset") did not mention the default;
      # confirm that `filter_with_hvg` is the intended default rather than
      # "no subsetting".
      - name: "--reference_var_input"
        type: string
        default: "filter_with_hvg"
        required: false
        description: |
          .var column containing highly variable genes, used to subset the reference genes (default: `filter_with_hvg`). If no column is set, genes will not be subset.
      # Optional: the description refers to this argument itself; there is no
      # separate `gene_name_layer` argument in this component.
      - name: "--reference_var_gene_names"
        type: string
        required: false
        description: |
          The name of the adata var column in the reference data containing gene names; when not provided, the var index will be used.

  # Arguments selecting a pre-trained model or controlling training and
  # prediction refinement.
  - name: Model arguments
    description: Model arguments.
    arguments:
      - name: "--model"
        type: file
        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
        required: false
        example: pretrained_model.pkl
      - name: "--feature_selection"
        type: boolean
        description: "Whether to perform feature selection."
        default: false
      - name: "--majority_voting"
        type: boolean
        description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
        default: false
      - name: "--C"
        type: double
        description: "Inverse of regularization strength in logistic regression."
        default: 1.0
      - name: "--max_iter"
        type: integer
        description: "Maximum number of iterations before reaching the minimum of the cost function."
        default: 1000
      - name: "--use_SGD"
        type: boolean_true
        description: "Whether to use the stochastic gradient descent algorithm."
      - name: "--min_prop"
        type: double
        # Literal block scalar: the text must not be wrapped in double quotes,
        # otherwise the quote characters become part of the help text.
        description: |
          For the dominant cell type within a subcluster, the minimum proportion of cells required to
          support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
          A subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'.
        default: 0

  # Output file and the names of the `.obs` columns that receive the results.
  - name: Outputs
    description: Output arguments.
    arguments:
      - name: "--output"
        type: file
        direction: output
        description: Output h5mu file.
        example: output.h5mu
      # Column for the predicted labels.
      - name: "--output_obs_predictions"
        type: string
        required: false
        default: celltypist_pred
        description: |
          In which `.obs` slots to store the predicted information.
      # Column for the probability attached to each prediction.
      - name: "--output_obs_probability"
        type: string
        required: false
        default: celltypist_probability
        description: |
          In which `.obs` slots to store the probability of the predictions.
    # Merges the shared definition from h5_compression_argument.yaml into this
    # group (presumably the output compression argument; see that file).
    __merge__: [., /src/base/h5_compression_argument.yaml]

# Files bundled with the component: the Python script entry plus helper
# modules taken from /src/utils.
resources:
  - type: python_script
    path: script.py
  # Helper modules from /src/utils, listed without an explicit type.
  - path: /src/utils/setup_logger.py
  - path: /src/utils/cross_check_genes.py
  - path: /src/utils/subset_vars.py
  - path: /src/utils/set_var_index.py
  - path: /src/utils/is_lognormalized.py

# Test script and the test data directories it is given access to.
test_resources:
  - type: python_script
    path: test.py
  # Data directories under /resources_test.
  - path: /resources_test/annotation_test_data/
  - path: /resources_test/pbmc_1k_protein_v3/

# Container definition for the component.
engines:
  - type: docker
    # NVIDIA PyTorch base image.
    # NOTE(review): presumably chosen for GPU support, in line with the `gpu`
    # label on the nextflow runner; confirm.
    image: nvcr.io/nvidia/pytorch:25.11-py3
    setup:
      # CellTypist itself, pinned to an exact version.
      - type: python
        packages:
          - celltypist==1.7.1
      # Shared anndata/mudata requirements merged in from the base file.
      - type: python
        __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
    # Additional setup for testing: shared scanpy requirements.
    test_setup:
      - type: python
        __merge__: [ /src/base/requirements/scanpy.yaml, .]
    # NOTE(review): merges the shared python test setup into this engine entry;
    # how it combines with the `test_setup` list above depends on Viash's merge
    # rules and on the order of this list; confirm before reordering.
    __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
# Execution targets: a standalone executable and a Nextflow module.
runners:
  - type: executable
  - type: nextflow
    directives:
      # Labels attached to the Nextflow process.
      label:
        - highcpu
        - highmem
        - highdisk
        - gpu