Files
biobox/target/nextflow/umi_tools/umi_tools_dedup/nextflow_schema.json
CI b833613f7c Build branch main with version main (923a6da)
Build pipeline: viash-hub.biobox.main-dvp9l

Source commit: 923a6da389

Source message: Bug Fixed (#136)
2024-08-14 21:18:23 +00:00

636 lines
26 KiB
JSON

{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "umi_tools_dedup",
"description": "Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.\n",
"type": "object",
"definitions": {
"inputs" : {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"input": {
"type":
"string",
"description": "Type: `file`, required. Input BAM or SAM file",
"help_text": "Type: `file`, required. Input BAM or SAM file. Use --in_sam to specify SAM format."
}
,
"in_sam": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. By default, inputs are assumed to be in BAM format",
"help_text": "Type: `boolean_true`, default: `false`. By default, inputs are assumed to be in BAM format. Use this options to specify the use of SAM\nformat for input.\n"
,
"default": "False"
}
,
"bai": {
"type":
"string",
"description": "Type: `file`. BAM index",
"help_text": "Type: `file`. BAM index"
}
,
"random_seed": {
"type":
"integer",
"description": "Type: `integer`. Random seed to initialize number generator with",
"help_text": "Type: `integer`. Random seed to initialize number generator with."
}
}
},
"outputs" : {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"output": {
"type":
"string",
"description": "Type: `file`, required, default: `$id.$key.output.output`. Deduplicated BAM file",
"help_text": "Type: `file`, required, default: `$id.$key.output.output`. Deduplicated BAM file."
,
"default": "$id.$key.output.output"
}
,
"out_sam": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. By default, outputa are written in BAM format",
"help_text": "Type: `boolean_true`, default: `false`. By default, outputa are written in BAM format. Use this options to specify the use of SAM format\nfor output.\n"
,
"default": "False"
}
,
"paired": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. BAM is paired end - output both read pairs",
"help_text": "Type: `boolean_true`, default: `false`. BAM is paired end - output both read pairs. This will also force the use of the template length\nto determine reads with the same mapping coordinates.\n"
,
"default": "False"
}
,
"output_stats": {
"type":
"string",
"description": "Type: `string`. Generate files containing UMI based deduplication statistics files with this prefix in the file names",
"help_text": "Type: `string`. Generate files containing UMI based deduplication statistics files with this prefix in the file names.\n"
}
,
"extract_umi_method": {
"type":
"string",
"description": "Type: `string`, example: `read_id`, choices: ``read_id`, `tag`, `umis``. Specify the method by which the barcodes were encoded in the read",
"help_text": "Type: `string`, example: `read_id`, choices: ``read_id`, `tag`, `umis``. Specify the method by which the barcodes were encoded in the read.\nThe options are:\n * read_id (default) \n * tag\n * umis\n",
"enum": ["read_id", "tag", "umis"]
}
,
"umi_tag": {
"type":
"string",
"description": "Type: `string`. The tag containing the UMI sequence",
"help_text": "Type: `string`. The tag containing the UMI sequence. This is only required if the extract_umi_method is set to tag.\n"
}
,
"umi_separator": {
"type":
"string",
"description": "Type: `string`, example: `_`. The separator used to separate the UMI from the read sequence",
"help_text": "Type: `string`, example: `_`. The separator used to separate the UMI from the read sequence. This is only required if the\nextract_umi_method is set to id_read. Default: `_`.\n"
}
,
"umi_tag_split": {
"type":
"string",
"description": "Type: `string`. Separate the UMI in tag by \u003cSPLIT\u003e and take the first element",
"help_text": "Type: `string`. Separate the UMI in tag by \u003cSPLIT\u003e and take the first element."
}
,
"umi_tag_delimiter": {
"type":
"string",
"description": "Type: `string`. Separate the UMI in by \u003cDELIMITER\u003e and concatenate the elements",
"help_text": "Type: `string`. Separate the UMI in by \u003cDELIMITER\u003e and concatenate the elements."
}
,
"cell_tag": {
"type":
"string",
"description": "Type: `string`. The tag containing the cell barcode sequence",
"help_text": "Type: `string`. The tag containing the cell barcode sequence. This is only required if the extract_umi_method\nis set to tag.\n"
}
,
"cell_tag_split": {
"type":
"string",
"description": "Type: `string`. Separate the cell barcode in tag by \u003cSPLIT\u003e and take the first element",
"help_text": "Type: `string`. Separate the cell barcode in tag by \u003cSPLIT\u003e and take the first element."
}
,
"cell_tag_delimiter": {
"type":
"string",
"description": "Type: `string`. Separate the cell barcode in by \u003cDELIMITER\u003e and concatenate the elements",
"help_text": "Type: `string`. Separate the cell barcode in by \u003cDELIMITER\u003e and concatenate the elements."
}
}
},
"grouping options" : {
"title": "Grouping Options",
"type": "object",
"description": "No description",
"properties": {
"method": {
"type":
"string",
"description": "Type: `string`, example: `directional`, choices: ``unique`, `percentile`, `cluster`, `adjacency`, `directional``. The method to use for grouping reads",
"help_text": "Type: `string`, example: `directional`, choices: ``unique`, `percentile`, `cluster`, `adjacency`, `directional``. The method to use for grouping reads. \nThe options are: \n * unique\n * percentile\n * cluster\n * adjacency\n * directional (default)\n",
"enum": ["unique", "percentile", "cluster", "adjacency", "directional"]
}
,
"edit_distance_threshold": {
"type":
"integer",
"description": "Type: `integer`, example: `1`. For the adjacency and cluster methods the threshold for the edit distance to connect two\nUMIs in the network can be increased",
"help_text": "Type: `integer`, example: `1`. For the adjacency and cluster methods the threshold for the edit distance to connect two\nUMIs in the network can be increased. The default value of 1 works best unless the UMI is\nvery long (\u003e14bp). Default: `1`.\n"
}
,
"spliced_is_unique": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Causes two reads that start in the same position on the same strand and having the same UMI\nto be considered unique if one is spliced and the other is not",
"help_text": "Type: `boolean_true`, default: `false`. Causes two reads that start in the same position on the same strand and having the same UMI\nto be considered unique if one is spliced and the other is not. (Uses the \u0027N\u0027 cigar operation\nto test for splicing).\n"
,
"default": "False"
}
,
"soft_clip_threshold": {
"type":
"integer",
"description": "Type: `integer`, example: `4`. Mappers that soft clip will sometimes do so rather than mapping a spliced read if there is only\na small overhang over the exon junction",
"help_text": "Type: `integer`, example: `4`. Mappers that soft clip will sometimes do so rather than mapping a spliced read if there is only\na small overhang over the exon junction. By setting this option, you can treat reads with at\nleast this many bases soft-clipped at the 3\u0027 end as spliced. Default: `4`.\n"
}
,
"multimapping_detection_method": {
"type":
"string",
"description": "Type: `string`. If the sam/bam contains tags to identify multimapping reads, you can specify for use when selecting\nthe best read at a given loci",
"help_text": "Type: `string`. If the sam/bam contains tags to identify multimapping reads, you can specify for use when selecting\nthe best read at a given loci. Supported tags are `NH`, `X0` and `XT`. If not specified, the read\nwith the highest mapping quality will be selected.\n"
}
,
"read_length": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Use the read length as a criteria when deduping, for e",
"help_text": "Type: `boolean_true`, default: `false`. Use the read length as a criteria when deduping, for e.g. sRNA-Seq."
,
"default": "False"
}
}
},
"single-cell rna-seq options" : {
"title": "Single-cell RNA-Seq Options",
"type": "object",
"description": "No description",
"properties": {
"per_gene": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Reads will be grouped together if they have the same gene",
"help_text": "Type: `boolean_true`, default: `false`. Reads will be grouped together if they have the same gene. This is useful if your library prep\ngenerates PCR duplicates with non identical alignment positions such as CEL-Seq. Note this option\nis hardcoded to be on with the count command. I.e. counting is always performed per-gene. Must be\ncombined with either --gene_tag or --per_contig option.\n"
,
"default": "False"
}
,
"gene_tag": {
"type":
"string",
"description": "Type: `string`. Deduplicate per gene",
"help_text": "Type: `string`. Deduplicate per gene. The gene information is encoded in the bam read tag specified.\n"
}
,
"assigned_status_tag": {
"type":
"string",
"description": "Type: `string`. BAM tag which describes whether a read is assigned to a gene",
"help_text": "Type: `string`. BAM tag which describes whether a read is assigned to a gene. Defaults to the same value as given\nfor --gene_tag.\n"
}
,
"skip_tags_regex": {
"type":
"string",
"description": "Type: `string`. Use in conjunction with the --assigned_status_tag option to skip any reads where the tag matches\nthis regex",
"help_text": "Type: `string`. Use in conjunction with the --assigned_status_tag option to skip any reads where the tag matches\nthis regex. Default (\"^[__|Unassigned]\") matches anything which starts with \"__\" or \"Unassigned\".\n"
}
,
"per_contig": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Deduplicate per contig (field 3 in BAM; RNAME)",
"help_text": "Type: `boolean_true`, default: `false`. Deduplicate per contig (field 3 in BAM; RNAME). All reads with the sam contig will be considered to\nhave the same alignment position. This is useful if you have aligned to a reference transcriptome\nwith one transcript per gene. If you have aligned to a transcriptome with more than one transcript\nper gene, you can supply a map between transcripts and gene using the --gene_transcript_map option.\n"
,
"default": "False"
}
,
"gene_transcript_map": {
"type":
"string",
"description": "Type: `file`. A file containing a mapping between gene names and transcript names",
"help_text": "Type: `file`. A file containing a mapping between gene names and transcript names. The file should be tab\nseparated with the gene name in the first column and the transcript name in the second column.\n"
}
,
"per_cell": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Reads will only be grouped together if they have the same cell barcode",
"help_text": "Type: `boolean_true`, default: `false`. Reads will only be grouped together if they have the same cell barcode. Can be combined with\n--per_gene.\n"
,
"default": "False"
}
}
},
"sam/bam options" : {
"title": "SAM/BAM Options",
"type": "object",
"description": "No description",
"properties": {
"mapping_quality": {
"type":
"integer",
"description": "Type: `integer`, example: `0`. Minimium mapping quality (MAPQ) for a read to be retained",
"help_text": "Type: `integer`, example: `0`. Minimium mapping quality (MAPQ) for a read to be retained. Default: `0`.\n"
}
,
"unmapped_reads": {
"type":
"string",
"description": "Type: `string`, example: `discard`. How unmapped reads should be handled",
"help_text": "Type: `string`, example: `discard`. How unmapped reads should be handled. \nThe options are:\n * \"discard\": Discard all unmapped reads. (default)\n * \"use\": If read2 is unmapped, deduplicate using read1 only. Requires --paired.\n * \"output\": Output unmapped reads/read pairs without UMI grouping/deduplication. Only available in umi_tools group.\n"
}
,
"chimeric_pairs": {
"type":
"string",
"description": "Type: `string`, example: `use`, choices: ``discard`, `use`, `output``. How chimeric pairs should be handled",
"help_text": "Type: `string`, example: `use`, choices: ``discard`, `use`, `output``. How chimeric pairs should be handled. \nThe options are:\n * \"discard\": Discard all chimeric read pairs.\n * \"use\": Deduplicate using read1 only. (default)\n * \"output\": Output chimeric pairs without UMI grouping/deduplication. Only available in\n umi_tools group.\n",
"enum": ["discard", "use", "output"]
}
,
"unpaired_reads": {
"type":
"string",
"description": "Type: `string`, example: `use`, choices: ``discard`, `use`, `output``. How unpaired reads should be handled",
"help_text": "Type: `string`, example: `use`, choices: ``discard`, `use`, `output``. How unpaired reads should be handled. \nThe options are: \n * \"discard\": Discard all unmapped reads.\n * \"use\": If read2 is unmapped, deduplicate using read1 only. Requires --paired. (default)\n * \"output\": Output unmapped reads/read pairs without UMI grouping/deduplication. Only available\n in umi_tools group.\n",
"enum": ["discard", "use", "output"]
}
,
"ignore_umi": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Ignore the UMI and group reads using mapping coordinates only",
"help_text": "Type: `boolean_true`, default: `false`. Ignore the UMI and group reads using mapping coordinates only."
,
"default": "False"
}
,
"subset": {
"type":
"number",
"description": "Type: `double`. Only consider a fraction of the reads, chosen at random",
"help_text": "Type: `double`. Only consider a fraction of the reads, chosen at random. This is useful for doing saturation\nanalyses.\n"
}
,
"chrom": {
"type":
"string",
"description": "Type: `string`. Only consider a single chromosome",
"help_text": "Type: `string`. Only consider a single chromosome. This is useful for debugging/testing purposes."
}
}
},
"group/dedup options" : {
"title": "Group/Dedup Options",
"type": "object",
"description": "No description",
"properties": {
"no_sort_output": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. By default, output is sorted",
"help_text": "Type: `boolean_true`, default: `false`. By default, output is sorted. This involves the use of a temporary unsorted file (saved in\n--temp_dir). Use this option to turn off sorting.\n"
,
"default": "False"
}
,
"buffer_whole_contig": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Forces dedup to parse an entire contig before yielding any reads for deduplication",
"help_text": "Type: `boolean_true`, default: `false`. Forces dedup to parse an entire contig before yielding any reads for deduplication. This is the\nonly way to absolutely guarantee that all reads with the same start position are grouped together\nfor deduplication since dedup uses the start position of the read, not the alignment coordinate on\nwhich the reads are sorted. However, by default, dedup reads for another 1000bp before outputting\nread groups which will avoid any reads being missed with short read sequencing (\u003c1000bp).\n"
,
"default": "False"
}
}
},
"common options" : {
"title": "Common Options",
"type": "object",
"description": "No description",
"properties": {
"log": {
"type":
"string",
"description": "Type: `file`. File with logging information",
"help_text": "Type: `file`. File with logging information."
}
,
"log2stderr": {
"type":
"boolean",
"description": "Type: `boolean_true`, default: `false`. Send logging information to stderr",
"help_text": "Type: `boolean_true`, default: `false`. Send logging information to stderr."
,
"default": "False"
}
,
"verbose": {
"type":
"integer",
"description": "Type: `integer`, example: `0`. Log level",
"help_text": "Type: `integer`, example: `0`. Log level. The higher, the more output. Default: `0`.\n"
}
,
"error": {
"type":
"string",
"description": "Type: `file`. File with error information",
"help_text": "Type: `file`. File with error information."
}
,
"temp_dir": {
"type":
"string",
"description": "Type: `string`. Directory for temporary files",
"help_text": "Type: `string`. Directory for temporary files. If not set, the bash environmental variable TMPDIR is used.\n"
}
,
"compresslevel": {
"type":
"integer",
"description": "Type: `integer`, example: `6`. Level of Gzip compression to use",
"help_text": "Type: `integer`, example: `6`. Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default.\nDefault: `6`.\n"
}
,
"timeit": {
"type":
"string",
"description": "Type: `file`. Store timing information in file",
"help_text": "Type: `file`. Store timing information in file."
}
,
"timeit_name": {
"type":
"string",
"description": "Type: `string`, example: `all`. Name in timing file for this class of jobs",
"help_text": "Type: `string`, example: `all`. Name in timing file for this class of jobs. Default: `all`.\n"
}
,
"timeit_header": {
"type":
"string",
"description": "Type: `string`. Add header for timing information",
"help_text": "Type: `string`. Add header for timing information."
}
}
},
"nextflow input-output arguments" : {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type":
"string",
"description": "Type: `string`, required, example: `output/`. Path to an output directory",
"help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
}
,
"param_list": {
"type":
"string",
"description": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel",
"help_text": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob.\n\n* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ [\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027], [\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027] ]`.\n* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`.\n* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]`.\n* A yaml blob can also be passed directly as a string. Example: `--param_list \"[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]\"`.\n\nWhen passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/inputs"
},
{
"$ref": "#/definitions/outputs"
},
{
"$ref": "#/definitions/grouping options"
},
{
"$ref": "#/definitions/single-cell rna-seq options"
},
{
"$ref": "#/definitions/sam/bam options"
},
{
"$ref": "#/definitions/group/dedup options"
},
{
"$ref": "#/definitions/common options"
},
{
"$ref": "#/definitions/nextflow input-output arguments"
}
]
}