Files
openpipeline/target/nextflow/scgpt/pad_tokenize/nextflow_schema.json
CI bb7533583f Build branch fix-integration-tests with version fix-integration-tests (da62b4ff)
Build pipeline: vsh-ci-dev-gckj5

Source commit: da62b4ffe3

Source message: Add labels to qc_test component
2024-11-15 14:37:33 +00:00

229 lines
8.3 KiB
JSON

{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "pad_tokenize",
"description": "Tokenize and pad a batch of data for scGPT integration zero-shot inference or fine-tuning.\n",
"type": "object",
"definitions": {
"inputs" : {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type":
"string",
"description": "Type: `file`, required, example: `input.h5mu`. The input h5mu file of pre-processed data",
"help_text": "Type: `file`, required, example: `input.h5mu`. The input h5mu file of pre-processed data.\n"
}
,
"modality": {
"type":
"string",
"description": "Type: `string`, default: `rna`. ",
"help_text": "Type: `string`, default: `rna`. "
,
"default": "rna"
}
,
"model_vocab": {
"type":
"string",
"description": "Type: `file`, required, example: `vocab.json`. Path to model vocabulary file",
"help_text": "Type: `file`, required, example: `vocab.json`. Path to model vocabulary file.\n"
}
,
"input_layer": {
"type":
"string",
"description": "Type: `string`, default: `binned`. The name of the layer to be padded and tokenized",
"help_text": "Type: `string`, default: `binned`. The name of the layer to be padded and tokenized.\n"
,
"default": "binned"
}
,
"var_gene_names": {
"type":
"string",
"description": "Type: `string`. The name of the ",
"help_text": "Type: `string`. The name of the .var column containing gene names. When no gene_name_layer is provided, the .var index will be used.\n"
}
}
},
"outputs" : {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"output": {
"type":
"string",
"description": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. The output h5mu file containing obsm arrays for gene tokens, tokenized data and padding mask",
"help_text": "Type: `file`, required, default: `$id.$key.output.h5mu`, example: `output.h5mu`. The output h5mu file containing obsm arrays for gene tokens, tokenized data and padding mask.\n"
,
"default": "$id.$key.output.h5mu"
}
,
"output_compression": {
"type":
"string",
"description": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression type for the output file",
"help_text": "Type: `string`, example: `gzip`, choices: ``gzip`, `lzf``. The compression type for the output file.\n",
"enum": ["gzip", "lzf"]
}
,
"obsm_gene_tokens": {
"type":
"string",
"description": "Type: `string`, default: `gene_id_tokens`, example: `values.pt`. The key of the ",
"help_text": "Type: `string`, default: `gene_id_tokens`, example: `values.pt`. The key of the .obsm array containing the gene token ids\n"
,
"default": "gene_id_tokens"
}
,
"obsm_tokenized_values": {
"type":
"string",
"description": "Type: `string`, default: `values_tokenized`. The key of the ",
"help_text": "Type: `string`, default: `values_tokenized`. The key of the .obsm array containing the count values of the tokenized genes\n"
,
"default": "values_tokenized"
}
,
"obsm_padding_mask": {
"type":
"string",
"description": "Type: `string`, default: `padding_mask`. The key of the ",
"help_text": "Type: `string`, default: `padding_mask`. The key of the .obsm array containing the padding mask.\n"
,
"default": "padding_mask"
}
}
},
"arguments" : {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"pad_token": {
"type":
"string",
"description": "Type: `string`, default: `\u003cpad\u003e`. Token used for padding",
"help_text": "Type: `string`, default: `\u003cpad\u003e`. Token used for padding.\n"
,
"default": "<pad>"
}
,
"pad_value": {
"type":
"integer",
"description": "Type: `integer`, default: `-2`. The value of the padding token",
"help_text": "Type: `integer`, default: `-2`. The value of the padding token.\n"
,
"default": "-2"
}
,
"max_seq_len": {
"type":
"integer",
"description": "Type: `integer`. The maximum sequence length of the tokenized data",
"help_text": "Type: `integer`. The maximum sequence length of the tokenized data.\n"
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
,
"param_list": {
"type":
"string",
"description": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel",
"help_text": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob.\n\n* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ [\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027], [\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027] ]`.\n* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`.\n* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]`.\n* A yaml blob can also be passed directly as a string. Example: `--param_list \"[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]\"`.\n\nWhen passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/inputs"
},
{
"$ref": "#/definitions/outputs"
},
{
"$ref": "#/definitions/arguments"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}