Build pipeline: vsh-ci-dev-k8tz4
Source commit: 2dbe3b7231
Source message: Fix pointers to test resources
140 lines
5.1 KiB
YAML
140 lines
5.1 KiB
YAML
# Component identity and authorship.
name: "random_forest_annotation"
namespace: "annotate"
description: >-
  Automated cell type annotation tool for scRNA-seq datasets
  on the basis of random forest.
authors:
# The entry merges in the referenced author file and sets the role locally.
- __merge__: "/src/authors/jakub_majercik.yaml"
  roles:
  - author
# Command-line arguments, organised into named groups.
# First group: the query dataset that is to be annotated.
argument_groups:
- name: "Inputs"
  description: "Input dataset (query) arguments"
  arguments:
  - name: "--input"
    type: file
    direction: input
    required: true
    description: "The input (query) data to be labeled. Should be a .h5mu file."
    example: "input.h5mu"
  - name: "--modality"
    type: string
    required: false
    default: "rna"
    description: "Which modality to process."
  # No `required` or `default` is declared for this argument.
  - name: "--input_layer"
    type: string
    description: >-
      The layer in the input data to be used for cell type annotation
      if .X is not to be used.
# Reference dataset arguments. Per the argument descriptions, the reference
# is only needed when no pre-trained --model is supplied.
- name: "Reference"
  description: "Arguments related to the reference dataset."
  arguments:
  - name: "--reference"
    type: file
    direction: input
    required: false
    description: >-
      The reference data to train the random forest classifier on.
      Only required if a pre-trained --model is not provided.
    example: "reference.h5mu"
  - name: "--reference_layer"
    type: string
    required: false
    description: >-
      The layer in the reference data to be used for cell type annotation
      if .X is not to be used. Data are expected to be processed in the same
      way as the --input query dataset.
  - name: "--reference_obs_target"
    type: string
    # NOTE(review): this is required although --reference itself is optional.
    # Confirm whether it must also be passed when a pre-trained --model is used.
    required: true
    description: "Key in obs field of reference modality with cell-type information."
# Output file, its compression, and the `.obs` keys that receive the
# predictions and their probabilities.
- name: "Outputs"
  description: "Output arguments."
  arguments:
  # NOTE(review): no `required` flag is declared for --output; confirm intended.
  - name: "--output"
    type: file
    direction: output
    description: "Output h5mu file."
    example: "output.h5mu"
  - name: "--output_compression"
    type: string
    required: false
    choices:
    - gzip
    - lzf
    description: "The compression format to be used on the output h5mu object."
    example: "gzip"
  - name: "--output_obs_predictions"
    type: string
    required: false
    default: "random_forest_pred"
    description: |
      In which `.obs` slots to store the predicted information.
  - name: "--output_obs_probability"
    type: string
    required: false
    default: "random_forest_probability"
    description: |
      In which `.obs` slots to store the probability of the predictions.
# Pre-trained model input and the hyperparameters of the random forest.
- name: "Model arguments"
  description: "Model arguments."
  arguments:
  # NOTE(review): a .pkl model is presumably unpickled by the script; if so,
  # only models from trusted sources should be passed here.
  - name: "--model"
    type: file
    required: false
    description: >-
      Pretrained model in pkl format. If not provided, the model will be
      trained on the reference data and --reference should be provided.
    example: "pretrained_model.pkl"
  - name: "--n_estimators"
    type: integer
    required: false
    default: 100
    description: "Number of trees in the random forest."
  # No default is declared: see the description for what omitting it means.
  - name: "--max_depth"
    type: integer
    required: false
    description: |
      Maximum depth of the trees in the random forest.
      If not provided, the nodes are expanded until all leaves only contain a single sample.
  - name: "--criterion"
    type: string
    required: false
    choices:
    - gini
    - entropy
    - log_loss
    default: "gini"
    description: "The function to measure the quality of a split."
  - name: "--class_weight"
    type: string
    required: false
    choices:
    - balanced
    - balanced_subsample
    - uniform
    default: "balanced_subsample"
    description: |
      Weights associated with classes.
      The `balanced` mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data.
      The `balanced_subsample` mode is the same as `balanced` except that weights are computed based on the bootstrap sample for every tree grown.
      The `uniform` mode gives all classes a weight of one.
  # Typed as a string because the value is either an integer or a keyword;
  # the default is therefore the quoted string "200".
  - name: "--max_features"
    type: string
    default: "200"
    description: |
      The number of features to consider when looking for the best split. The value can either be a positive integer or one of `sqrt`, `log2` or `all`.
      If integer: consider max_features features at each split.
      If `sqrt`: max_features is the square root of the number of input features.
      If `log2`: max_features is the log2 of the number of input features.
      If `all`: max_features equals the number of input features.
# Files shipped with the component: the Python entry script and a helper
# module taken from /src/utils.
resources:
- type: python_script
  path: "script.py"
- path: "/src/utils/setup_logger.py"
# Test script plus the two .h5mu files listed as test resources.
test_resources:
- type: python_script
  path: "test.py"
- path: "/resources_test/annotation_test_data/TS_Blood_filtered.h5mu"
- path: "/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
# Docker engine: base image plus apt and Python package setup steps.
engines:
- type: docker
  image: "python:3.12-slim"
  setup:
  - type: apt
    packages:
    - libhdf5-dev
    - procps
  - type: python
    packages:
    - "scikit-learn==1.4.2"
  # The package list for this step is merged in from the referenced file.
  - type: python
    __merge__:
    - "/src/base/requirements/anndata_mudata.yaml"
    - "."
  # NOTE(review): indentation was lost in the source; this second __merge__ is
  # placed at engine level because the last setup step already has one.
  __merge__:
  - "/src/base/requirements/python_test_setup.yaml"
  - "."
runners:
|
|
- type: executable
|
|
- type: nextflow |