# Identity and summary of this component.
name: spatial_process_samples
namespace: workflows/multiomics
scope: public
description: A pipeline to pre-process multiple spatial omics samples.
# Each author entry references a shared author file through `__merge__`
# and lists that person's roles for this component.
authors:
  - __merge__: /src/authors/dries_schaumont.yaml
    roles:
      - author
      - maintainer
  - __merge__: /src/authors/dorien_roosen.yaml
    roles:
      - contributor
  - __merge__: /src/authors/weiwei_schultz.yaml
    roles:
      - contributor

argument_groups:
  # Sample identifier, input file and the per-modality input layers.
  - name: 'Inputs'
    arguments:
      - name: '--id'
        type: string
        required: true
        description: 'ID of the sample.'
        example: 'foo'
      - name: '--input'
        alternatives: ['-i']
        type: file
        required: true
        description: 'Path to the sample.'
        example: 'input.h5mu'
      - name: '--rna_layer'
        type: string
        required: false
        description: 'Input layer for the gene expression modality. If not specified, .X is used.'
      - name: '--prot_layer'
        type: string
        required: false
        description: 'Input layer for the antibody capture modality. If not specified, .X is used.'

  # Single required output file argument.
  - name: 'Outputs'
    arguments:
      - name: '--output'
        direction: output
        type: file
        required: true
        description: 'Destination path to the output.'
        example: 'output.h5mu'

  # Controls whether and how the value of --id is written to .obs.
  - name: "Sample ID options"
    description: |
      Options for adding the id to .obs on the MuData object. Having a sample
      id present is a requirement of several components for this pipeline.
    arguments:
      - name: "--add_id_to_obs"
        type: boolean
        default: true
        description: "Add the value passed with --id to .obs."
      - name: "--add_id_obs_output"
        type: string
        default: "sample_id"
        description: |
          .obs column to add the sample IDs to. Required and only used when
          --add_id_to_obs is set to 'true'.
      - name: "--add_id_make_observation_keys_unique"
        type: boolean
        default: true
        description: |
          Join the id to the .obs index (.obs_names).
          Only used when --add_id_to_obs is set to 'true'.

  # Count, gene, mitochondrial and ribosomal thresholds for the RNA modality.
  - name: "RNA filtering options"
    arguments:
      - name: "--rna_min_counts"
        type: integer
        min: 1
        example: 200
        description: Minimum number of counts captured per cell.
      - name: "--rna_max_counts"
        type: integer
        min: 1
        example: 5000000
        description: Maximum number of counts captured per cell.
      - name: "--rna_min_genes_per_cell"
        type: integer
        min: 1
        example: 200
        description: Minimum of non-zero values per cell.
      - name: "--rna_max_genes_per_cell"
        type: integer
        min: 1
        example: 1500000
        description: Maximum of non-zero values per cell.
      - name: "--rna_min_cells_per_gene"
        type: integer
        min: 1
        example: 3
        description: Minimum of non-zero values per gene.
      # Fractions are bounded to the [0, 1] interval via min/max.
      - name: "--rna_min_fraction_mito"
        type: double
        min: 0
        max: 1
        example: 0
        description: Minimum fraction of UMIs that are mitochondrial.
      - name: "--rna_max_fraction_mito"
        type: double
        min: 0
        max: 1
        example: 0.2
        description: Maximum fraction of UMIs that are mitochondrial.
      # The ribosomal descriptions previously said "mitochondrial" (copy-paste).
      - name: "--rna_min_fraction_ribo"
        type: double
        min: 0
        max: 1
        example: 0
        description: Minimum fraction of UMIs that are ribosomal.
      - name: "--rna_max_fraction_ribo"
        type: double
        min: 0
        max: 1
        example: 0.2
        description: Maximum fraction of UMIs that are ribosomal.

  # Count and protein thresholds for the protein modality.
  - name: "Protein filtering options"
    arguments:
      - name: "--prot_min_counts"
        type: integer
        min: 1
        example: 3
        description: Minimum number of counts per cell.
      # This is the upper bound; its description previously said "Minimum".
      - name: "--prot_max_counts"
        type: integer
        min: 1
        example: 5000000
        description: Maximum number of counts per cell.
      - name: "--prot_min_proteins_per_cell"
        type: integer
        min: 1
        example: 200
        description: Minimum of non-zero values per cell.
      - name: "--prot_max_proteins_per_cell"
        type: integer
        min: 1
        example: 100000000
        description: Maximum of non-zero values per cell.
      - name: "--prot_min_cells_per_protein"
        type: integer
        min: 1
        example: 3
        description: Minimum of non-zero values per protein.

  # Output slot and batch key for highly variable feature detection.
  # Both arguments keep a legacy `--filter_with_hvg_*` alternative name.
  - name: "Highly variable features detection"
    arguments:
      - name: "--highly_variable_features_var_output"
        alternatives: ["--filter_with_hvg_var_output"]
        type: string
        required: false
        default: "filter_with_hvg"
        description: In which .var slot to store a boolean array corresponding to the highly variable genes.
      - name: "--highly_variable_features_obs_batch_key"
        alternatives: ["--filter_with_hvg_obs_batch_key"]
        type: string
        required: false
        default: "sample_id"
        description: |
          If specified, highly-variable genes are selected within each batch separately and merged. This simple
          process avoids the selection of batch-specific genes and acts as a lightweight batch correction method.
  # Regexes and output slots used to flag mitochondrial and ribosomal genes.
  - name: "Mitochondrial & Ribosomal Gene Detection"
    arguments:
      - name: "--var_gene_names"
        type: string
        required: false
        example: "gene_symbol"
        description: |
          .var column name to be used to detect mitochondrial/ribosomal genes instead of .var_names (default if not set).
          Gene names matching with the regex value from --mitochondrial_gene_regex or --ribosomal_gene_regex will be
          identified as mitochondrial or ribosomal genes, respectively.
      - name: "--var_name_mitochondrial_genes"
        type: string
        required: false
        description: |
          In which .var slot to store a boolean array corresponding to the mitochondrial genes.
      - name: "--obs_name_mitochondrial_fraction"
        type: string
        required: false
        description: |
          When specified, write the fraction of counts originating from mitochondrial genes
          (based on --mitochondrial_gene_regex) to an .obs column with the specified name.
          Requires --var_name_mitochondrial_genes.
      - name: "--mitochondrial_gene_regex"
        type: string
        required: false
        default: "^[mM][tT]-"
        description: |
          Regex string that identifies mitochondrial genes from --var_gene_names.
          By default will detect human and mouse mitochondrial genes from a gene symbol.
      - name: "--var_name_ribosomal_genes"
        type: string
        required: false
        description: |
          In which .var slot to store a boolean array corresponding to the ribosomal genes.
      - name: "--obs_name_ribosomal_fraction"
        type: string
        required: false
        description: |
          When specified, write the fraction of counts originating from ribosomal genes
          (based on --ribosomal_gene_regex) to an .obs column with the specified name.
          Requires --var_name_ribosomal_genes.
      - name: "--ribosomal_gene_regex"
        type: string
        required: false
        default: "^[Mm]?[Rr][Pp][LlSs]"
        description: |
          Regex string that identifies ribosomal genes from --var_gene_names.
          By default will detect human and mouse ribosomal genes from a gene symbol.

  # Inputs for the QC metric calculation; both arguments take
  # comma-separated lists.
  - name: "QC metrics calculation options"
    arguments:
      - name: "--var_qc_metrics"
        type: string
        multiple: true
        multiple_sep: ','
        required: false
        example: "ercc,highly_variable"
        description: |
          Keys to select a boolean (containing only True or False) column from .var.
          For each cell, calculate the proportion of total values for genes which are labeled 'True',
          compared to the total sum of the values for all genes. Defaults to the combined values specified for
          --var_name_mitochondrial_genes and --highly_variable_features_var_output.
      - name: "--top_n_vars"
        type: integer
        multiple: true
        multiple_sep: ','
        required: false
        default: [50, 100, 200, 500]
        description: |
          Number of top vars to be used to calculate cumulative proportions.
          If not specified, proportions are not calculated. `--top_n_vars 20,50` finds
          cumulative proportion to the 20th and 50th most expressed vars.

  # Flag controlling whether existing PCA output slots may be overwritten.
  - name: 'PCA options'
    arguments:
      - name: '--pca_overwrite'
        description: 'Allow overwriting slots for PCA output.'
        type: boolean_true

  # Axis selection for the CLR transformation.
  - name: 'CLR options'
    arguments:
      - name: '--clr_axis'
        description: 'Axis to perform the CLR transformation on.'
        type: integer
        required: false
        default: 0

  # Opt-in scaling of the RNA modality and the output slots it writes to.
  - name: "RNA Scaling options"
    description: |
      Options for enabling scaling of the log-normalized data to unit variance and zero mean.
      The scaled data will be output to a different layer and a representation with reduced dimensions
      will be created and stored in addition to the non-scaled data.
    arguments:
      - name: "--rna_enable_scaling"
        type: boolean_true
        description: "Enable scaling for the RNA modality."
      - name: "--rna_scaling_output_layer"
        type: string
        default: "scaled"
        description: "Output layer where the scaled log-normalized data will be stored."
      - name: "--rna_scaling_pca_obsm_output"
        type: string
        default: "scaled_pca"
        description: |
          Name of the .obsm key where the PCA representation of the log-normalized
          and scaled data is stored.
      - name: "--rna_scaling_pca_loadings_varm_output"
        type: string
        default: "scaled_pca_loadings"
        description: |
          Name of the .varm key where the PCA loadings of the log-normalized and scaled
          data is stored.
      - name: "--rna_scaling_pca_variance_uns_output"
        type: string
        default: "scaled_pca_variance"
        description: |
          Name of the .uns key where the variance and variance ratio will be stored as a map.
          The map will contain two keys: variance and variance_ratio respectively.
      # Written as a literal block scalar like the sibling descriptions
      # (it was previously an implicit multi-line plain scalar).
      - name: "--rna_scaling_umap_obsm_output"
        type: string
        default: "scaled_umap"
        description: |
          Name of the .obsm key where the UMAP representation of the log-normalized
          and scaled data is stored.
      - name: "--rna_scaling_max_value"
        type: double
        required: false
        description: "Clip (truncate) data to this value after scaling. If not specified, do not clip."
      # boolean_false: passing the flag turns zero-centering off.
      - name: "--rna_scaling_zero_center"
        type: boolean_false
        description: "If set, omit zero-centering variables, which allows sparse input to be handled efficiently."

# Component this workflow depends on; it is referenced under the alias
# below and comes from the `openpipeline` repository.
dependencies:
  - repository: openpipeline
    name: workflows/multiomics/process_samples
    alias: spatial_sample_processing

# Nextflow script implementing the workflow, entered through `run_wf`.
resources:
  - path: main.nf
    type: nextflow_script
    entrypoint: run_wf

# Test script (entered through `test_wf`) plus the data file listed for it.
test_resources:
  - path: test.nf
    type: nextflow_script
    entrypoint: test_wf
  - path: /resources_test/xenium/xenium_tiny.h5mu

# Runner types declared for this component; only `nextflow` is listed.
runners:
  - type: nextflow