Files
biobox/target/nextflow/arriba/nextflow_schema.json
CI 27d6133895 Build branch biobox/main with version main to biobox on branch main (9991e9a)
Build pipeline: viash-hub.biobox.main-l6hlj

Source commit: 9991e9a4f5

Source message: Bump bases2fastq to 2.2.1 (#202)
2025-10-03 08:31:27 +00:00

274 lines
14 KiB
JSON

{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "arriba",
"description": "Arriba is a command-line tool for the detection of gene fusions from RNA-Seq data. It was developed for the use in a clinical research setting. Therefore, short runtimes and high sensitivity were important design criteria.\n\nArriba is based on the STAR RNA-Seq aligner and post-processes the alignments (output from STAR) to:\n\n1. detect split reads and discordant mates, which are indicative of structural rearrangements,\n2. find reads supporting gene fusions (i.e., reads spanning the breakpoints of gene fusions),\n3. perform various filtering steps to remove false positives, and\n4. output the final predictions in a standardized format.\n\nIn contrast to many other fusion detection tools, Arriba does not require to reduce the STAR parameter `--alignIntronMax` (maximum intron size). Reducing this parameter impairs detection of long introns and may affect expression quantification. Arriba reliably filters translocation-based false positives even when large maximum intron sizes are used.\n\n**Important**: Arriba requires BAM files that were aligned with STAR using specific chimeric alignment parameters, particularly `--chimOutType WithinBAM HardClip`. See the [official workflow documentation](https://github.com/suhrig/arriba/blob/master/run_arriba.sh) for the complete set of recommended STAR parameters.\n",
"type": "object",
"$defs": {
"inputs": {
"title": "Inputs",
"type": "object",
"description": "No description",
"properties": {
"bam": {
"type": "string",
"format": "path",
"exists": true,
"description": "File in SAM/BAM/CRAM format with main alignments as generated by STAR\n(Aligned.out.sam)",
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"Aligned.out.bam\"`. "
},
"genome": {
"type": "string",
"format": "path",
"exists": true,
"description": "FastA file with genome sequence (assembly)",
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"assembly.fa\"`. "
},
"gene_annotation": {
"type": "string",
"format": "path",
"exists": true,
"description": "GTF file with gene annotation",
"help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"annotation.gtf\"`. "
},
"known_fusions": {
"type": "string",
"format": "path",
"description": "File containing known/recurrent fusions",
"help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"known_fusions.tsv\"`. "
},
"blacklist": {
"type": "string",
"format": "path",
"description": "File containing blacklisted events (recurrent artifacts and transcripts \nobserved in healthy tissue).\n",
"help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"blacklist.tsv\"`. "
},
"structural_variants": {
"type": "string",
"format": "path",
"description": "Tab-separated file with coordinates of structural variants found using \nwhole-genome sequencing data",
"help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"structural_variants_from_WGS.tsv\"`. "
},
"tags": {
"type": "string",
"format": "path",
"description": "Tab-separated file containing fusions to annotate with tags in the 'tags' column",
"help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"tags.tsv\"`. "
},
"protein_domains": {
"type": "string",
"format": "path",
"description": "File in GFF3 format containing coordinates of the protein domains of genes",
"help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"protein_domains.gff3\"`. "
}
}
},
"outputs": {
"title": "Outputs",
"type": "object",
"description": "No description",
"properties": {
"fusions": {
"type": "string",
"format": "path",
"description": "Output file with fusions that have passed all filters.\n",
"help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.fusions.tsv\"`, direction: `output`, example: `\"fusions.tsv\"`. ",
"default": "$id.$key.fusions.tsv"
},
"fusions_discarded": {
"type": "string",
"format": "path",
"description": "Output file with fusions that were discarded due to filtering",
"help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.fusions_discarded.tsv\"`, direction: `output`, example: `\"fusions.discarded.tsv\"`. ",
"default": "$id.$key.fusions_discarded.tsv"
}
}
},
"arguments": {
"title": "Arguments",
"type": "object",
"description": "No description",
"properties": {
"max_genomic_breakpoint_distance": {
"type": "string",
"description": "When a file with genomic breakpoints obtained via \nwhole-genome sequencing is supplied via the --structural_variants\nparameter, this parameter determines how far a \ngenomic breakpoint may be away from a \ntranscriptomic breakpoint to consider it as a \nrelated event",
"help_text": "Type: `long`, multiple: `False`. "
},
"strandedness": {
"type": "string",
"description": "Whether a strand-specific protocol was used for library preparation, \nand if so, the type of strandedness (auto/yes/no/reverse)",
"help_text": "Type: `string`, multiple: `False`, choices: ``auto`, `yes`, `no`, `reverse``. ",
"enum": [
"auto",
"yes",
"no",
"reverse"
]
},
"interesting_contigs": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of interesting contigs",
"help_text": "Type: `string`, multiple: `True`, example: `[\"1\";\"2\";\"AC_*\";\"NC_*\"]`. "
},
"viral_contigs": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of viral contigs",
"help_text": "Type: `string`, multiple: `True`, example: `[\"AC_*\";\"NC_*\"]`. "
},
"disable_filters": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of filters to disable",
"help_text": "Type: `string`, multiple: `True`, choices: ``homologs`, `low_entropy`, `isoforms`, `top_expressed_viral_contigs`, `viral_contigs`, `uninteresting_contigs`, `non_coding_neighbors`, `mismatches`, `duplicates`, `no_genomic_support`, `genomic_support`, `intronic`, `end_to_end`, `relative_support`, `low_coverage_viral_contigs`, `merge_adjacent`, `mismappers`, `multimappers`, `same_gene`, `long_gap`, `internal_tandem_duplication`, `small_insert_size`, `read_through`, `inconsistently_clipped`, `intragenic_exonic`, `marginal_read_through`, `spliced`, `hairpin`, `blacklist`, `min_support`, `select_best`, `in_vitro`, `short_anchor`, `known_fusions`, `no_coverage`, `homopolymer`, `many_spliced``. "
},
"max_e_value": {
"type": "number",
"description": "Arriba estimates the number of fusions with a given number of supporting \nreads which one would expect to see by random chance",
"help_text": "Type: `double`, multiple: `False`. "
},
"min_supporting_reads": {
"type": "integer",
"description": "The 'min_support' filter discards all fusions with fewer than \nthis many supporting reads (split reads and discordant mates \ncombined)",
"help_text": "Type: `integer`, multiple: `False`, example: `2`. "
},
"max_mismappers": {
"type": "number",
"description": "When more than this fraction of supporting reads turns out to be \nmismappers, the 'mismappers' filter discards the fusion",
"help_text": "Type: `double`, multiple: `False`, example: `0.8`. "
},
"max_homolog_identity": {
"type": "number",
"description": "Genes with more than the given fraction of sequence identity are \nconsidered homologs and removed by the 'homologs' filter",
"help_text": "Type: `double`, multiple: `False`, example: `0.3`. "
},
"homopolymer_length": {
"type": "integer",
"description": "The 'homopolymer' filter removes breakpoints adjacent to \nhomopolymers of the given length or more",
"help_text": "Type: `integer`, multiple: `False`, example: `6`. "
},
"read_through_distance": {
"type": "integer",
"description": "The 'read_through' filter removes read-through fusions \nwhere the breakpoints are less than the given distance away \nfrom each other",
"help_text": "Type: `integer`, multiple: `False`, example: `10000`. "
},
"min_anchor_length": {
"type": "integer",
"description": "Alignment artifacts are often characterized by split reads coming \nfrom only one gene and no discordant mates",
"help_text": "Type: `integer`, multiple: `False`, example: `23`. "
},
"many_spliced_events": {
"type": "integer",
"description": "The 'many_spliced' filter recovers fusions between genes that \nhave at least this many spliced breakpoints",
"help_text": "Type: `integer`, multiple: `False`, example: `4`. "
},
"max_kmer_content": {
"type": "number",
"description": "The 'low_entropy' filter removes reads with repetitive 3-mers",
"help_text": "Type: `double`, multiple: `False`, example: `0.6`. "
},
"max_mismatch_pvalue": {
"type": "number",
"description": "The 'mismatches' filter uses a binomial model to calculate a \np-value for observing a given number of mismatches in a read",
"help_text": "Type: `double`, multiple: `False`, example: `0.05`. "
},
"fragment_length": {
"type": "integer",
"description": "When paired-end data is given, the fragment length is estimated \nautomatically and this parameter has no effect",
"help_text": "Type: `integer`, multiple: `False`, example: `200`. "
},
"max_reads": {
"type": "integer",
"description": "Subsample fusions with more than the given number of supporting reads",
"help_text": "Type: `integer`, multiple: `False`, example: `300`. "
},
"quantile": {
"type": "number",
"description": "Highly expressed genes are prone to produce artifacts during library \npreparation",
"help_text": "Type: `double`, multiple: `False`, example: `0.998`. "
},
"exonic_fraction": {
"type": "number",
"description": "The breakpoints of false-positive predictions of intragenic events \nare often both in exons",
"help_text": "Type: `double`, multiple: `False`, example: `0.33`. "
},
"top_n": {
"type": "integer",
"description": "Only report viral integration sites of the top N most highly expressed viral \ncontigs",
"help_text": "Type: `integer`, multiple: `False`, example: `5`. "
},
"covered_fraction": {
"type": "number",
"description": "Ignore virally associated events if the virus is not fully \nexpressed, i.e., less than the given fraction of the viral contig is \ntranscribed",
"help_text": "Type: `double`, multiple: `False`, example: `0.05`. "
},
"max_itd_length": {
"type": "integer",
"description": "Maximum length of internal tandem duplications",
"help_text": "Type: `integer`, multiple: `False`, example: `100`. "
},
"min_itd_allele_fraction": {
"type": "number",
"description": "Required fraction of supporting reads to report an internal \ntandem duplication",
"help_text": "Type: `double`, multiple: `False`, example: `0.07`. "
},
"min_itd_supporting_reads": {
"type": "integer",
"description": "Required absolute number of supporting reads to report an \ninternal tandem duplication",
"help_text": "Type: `integer`, multiple: `False`, example: `10`. "
},
"skip_duplicate_marking": {
"type": "boolean",
"description": "Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a \npreceding program using the BAM_FDUP flag",
"help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ",
"default": false
},
"extra_information": {
"type": "boolean",
"description": "To reduce the runtime and file size, by default, the columns 'fusion_transcript', \n'peptide_sequence', and 'read_identifiers' are left empty in the file containing \ndiscarded fusion candidates (see parameter -O)",
"help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ",
"default": false
},
"fill_gaps": {
"type": "boolean",
"description": "If assembly of the fusion transcript sequence from the supporting reads is incomplete \n(denoted as '...'), fill the gaps using the assembly sequence wherever possible",
"help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ",
"default": false
}
}
},
"nextflow input-output arguments": {
"title": "Nextflow input-output arguments",
"type": "object",
"description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
"properties": {
"publish_dir": {
"type": "string",
"description": "Path to an output directory.",
"help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. "
}
}
}
},
"allOf": [
{
"$ref": "#/$defs/inputs"
},
{
"$ref": "#/$defs/outputs"
},
{
"$ref": "#/$defs/arguments"
},
{
"$ref": "#/$defs/nextflow input-output arguments"
}
]
}