Files
biobox/target/executable/arriba/.config.vsh.yaml
CI 6f2f840fd9 Build branch main with version main (7f8bcc2)
Build pipeline: viash-hub.biobox.main-zp6tq

Source commit: 7f8bcc2b3e

Source message: BD rhapsody sequence analysis (#96)

* wip

* fix test

* add help

* update 2.2 args

* fix bug

* extend test data

* output separate files

* analyse missing args

* tweaks to test

* fix script

* fix test

* fix test

* move small reference

* wip generate wta test data

* don't forget about umi in r1

* remove unneeded pkg

* load reference in memory just once

* fix random choices

* extend test

* add abc immunediscoverypanel

* wip abc testing code

* fix abc test; need unique instrument, run and flowcell ids for each sample

* add smk data

* add entry to changelog

* remove old test file

* adapt test for missing read

* update description

* add comment

* ensure cwl files are absolute

* Apply suggestions from code review

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>

* fix suggestion

* newer pipelines have docker requirements as a hint instead of a strict requirement

* rename str to content

* remove deleted resources

* fix containers

* fix script

* fix suggestion

* fix suggestion...

* fix test

* fix component name

* fix test

* apply suggestions

* fix test

* added note

* fix changelog

* fix changelog again

* splitting hairs here

---------

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
2024-09-17 10:13:18 +00:00

734 lines
22 KiB
YAML

# Component identity.
name: "arriba"
version: "main"
# Component authors. Each entry carries the author's roles plus contact links
# and organizational affiliations under `info`.
authors:
- name: "Robrecht Cannoodt"
  roles:
  - "author"
  - "maintainer"
  info:
    links:
      email: "robrecht@data-intuitive.com"
      github: "rcannood"
      orcid: "0000-0003-3641-729X"
      linkedin: "robrechtcannoodt"
    organizations:
    - name: "Data Intuitive"
      href: "https://www.data-intuitive.com"
      role: "Data Science Engineer"
    - name: "Open Problems"
      href: "https://openproblems.bio"
      role: "Core Member"
# Command-line arguments, organised in named groups.
argument_groups:
# Input files. Only --bam, --genome and --gene_annotation are required; the
# remaining inputs are optional.
- name: "Inputs"
  arguments:
  - type: "file"
    name: "--bam"
    alternatives:
    - "-x"
    description: "File in SAM/BAM/CRAM format with main alignments as generated by\
      \ STAR\n(Aligned.out.sam). Arriba extracts candidate reads from this file.\n"
    info: null
    example:
    - "Aligned.out.bam"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--genome"
    alternatives:
    - "-a"
    description: "FastA file with genome sequence (assembly). The file may be gzip-compressed.\
      \ An \nindex with the file extension .fai must exist only if CRAM files are\
      \ processed.\n"
    info: null
    example:
    - "assembly.fa"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--gene_annotation"
    alternatives:
    - "-g"
    description: "GTF file with gene annotation. The file may be gzip-compressed.\n"
    info: null
    example:
    - "annotation.gtf"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--known_fusions"
    alternatives:
    - "-k"
    description: "File containing known/recurrent fusions. Some cancer entities are\
      \ often \ncharacterized by fusions between the same pair of genes. In order\
      \ to boost \nsensitivity, a list of known fusions can be supplied using this\
      \ parameter. The list \nmust contain two columns with the names of the fused\
      \ genes, separated by tabs.\n"
    info: null
    example:
    - "known_fusions.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--blacklist"
    alternatives:
    - "-b"
    description: "File containing blacklisted events (recurrent artifacts and transcripts\
      \ \nobserved in healthy tissue).\n"
    info: null
    example:
    - "blacklist.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--structural_variants"
    alternatives:
    - "-d"
    description: "Tab-separated file with coordinates of structural variants found\
      \ using \nwhole-genome sequencing data. These coordinates serve to increase\
      \ sensitivity \ntowards weakly expressed fusions and to eliminate fusions with\
      \ low evidence. \n"
    info: null
    example:
    - "structural_variants_from_WGS.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--tags"
    alternatives:
    - "-t"
    description: "Tab-separated file containing fusions to annotate with tags in the\
      \ 'tags' column. \nThe first two columns specify the genes; the third column\
      \ specifies the tag. The \nfile may be gzip-compressed. \n"
    info: null
    example:
    - "tags.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--protein_domains"
    alternatives:
    - "-p"
    description: "File in GFF3 format containing coordinates of the protein domains\
      \ of genes. The\nprotein domains retained in a fusion are listed in the column\n\
      'retained_protein_domains'. The file may be gzip-compressed.\n"
    info: null
    example:
    - "protein_domains.gff3"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
# Output files. --fusions is required; --fusions_discarded is optional.
- name: "Outputs"
  arguments:
  - type: "file"
    name: "--fusions"
    alternatives:
    - "-o"
    description: "Output file with fusions that have passed all filters.\n"
    info: null
    example:
    - "fusions.tsv"
    must_exist: true
    create_parent: true
    required: true
    direction: "output"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--fusions_discarded"
    alternatives:
    - "-O"
    description: "Output file with fusions that were discarded due to filtering. \n"
    info: null
    example:
    - "fusions.discarded.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "output"
    multiple: false
    multiple_sep: ";"
# Optional tuning parameters and flags. None of these is required; the
# descriptions state the default that applies when an argument is omitted.
- name: "Arguments"
  arguments:
  - type: "long"
    name: "--max_genomic_breakpoint_distance"
    alternatives:
    - "-D"
    description: "When a file with genomic breakpoints obtained via \nwhole-genome\
      \ sequencing is supplied via the --structural_variants\nparameter, this parameter\
      \ determines how far a \ngenomic breakpoint may be away from a \ntranscriptomic\
      \ breakpoint to consider it as a \nrelated event. For events inside genes, the\
      \ \ndistance is added to the end of the gene; for \nintergenic events, the distance\
      \ threshold is \napplied as is. Default: 100000.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--strandedness"
    alternatives:
    - "-s"
    description: "Whether a strand-specific protocol was used for library preparation,\
      \ \nand if so, the type of strandedness (auto/yes/no/reverse). When \nunstranded\
      \ data is processed, the strand can sometimes be inferred from \nsplice-patterns.\
      \ But in unclear situations, stranded data helps \nresolve ambiguities. Default:\
      \ auto\n"
    info: null
    required: false
    # The choices are quoted so that "yes" and "no" stay strings rather than
    # being read as booleans.
    choices:
    - "auto"
    - "yes"
    - "no"
    - "reverse"
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--interesting_contigs"
    alternatives:
    - "-i"
    description: "List of interesting contigs. Fusions between genes \non other contigs\
      \ are ignored. Contigs can be specified with or without the \nprefix \"chr\"\
      . Asterisks (*) are treated as wild-cards. \nDefault: 1 2 3 4 5 6 7 8 9 10 11\
      \ 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_*\n"
    info: null
    example:
    - "1"
    - "2"
    - "AC_*"
    - "NC_*"
    required: false
    direction: "input"
    multiple: true
    multiple_sep: ";"
  - type: "string"
    name: "--viral_contigs"
    alternatives:
    - "-v"
    description: "List of viral contigs. Asterisks (*) are treated as \nwild-cards.\n\
      Default: AC_* NC_*\n"
    info: null
    example:
    - "AC_*"
    - "NC_*"
    required: false
    direction: "input"
    multiple: true
    multiple_sep: ";"
  - type: "string"
    name: "--disable_filters"
    alternatives:
    - "-f"
    description: "List of filters to disable. By default all filters are \nenabled.\
      \ \n"
    info: null
    required: false
    choices:
    - "homologs"
    - "low_entropy"
    - "isoforms"
    - "top_expressed_viral_contigs"
    - "viral_contigs"
    - "uninteresting_contigs"
    - "non_coding_neighbors"
    - "mismatches"
    - "duplicates"
    - "no_genomic_support"
    - "genomic_support"
    - "intronic"
    - "end_to_end"
    - "relative_support"
    - "low_coverage_viral_contigs"
    - "merge_adjacent"
    - "mismappers"
    - "multimappers"
    - "same_gene"
    - "long_gap"
    - "internal_tandem_duplication"
    - "small_insert_size"
    - "read_through"
    - "inconsistently_clipped"
    - "intragenic_exonic"
    - "marginal_read_through"
    - "spliced"
    - "hairpin"
    - "blacklist"
    - "min_support"
    - "select_best"
    - "in_vitro"
    - "short_anchor"
    - "known_fusions"
    - "no_coverage"
    - "homopolymer"
    - "many_spliced"
    direction: "input"
    multiple: true
    multiple_sep: ";"
  - type: "double"
    name: "--max_e_value"
    alternatives:
    - "-E"
    description: "Arriba estimates the number of fusions with a given number of supporting\
      \ \nreads which one would expect to see by random chance. If the expected number\
      \ \nof fusions (e-value) is higher than this threshold, the fusion is \ndiscarded\
      \ by the 'relative_support' filter. Note: Increasing this \nthreshold can dramatically\
      \ increase the number of false positives and may \nincrease the runtime of resource-intensive\
      \ steps. Fractional values are \npossible. Default: 0.300000 \n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--min_supporting_reads"
    alternatives:
    - "-S"
    description: "The 'min_support' filter discards all fusions with fewer than \n\
      this many supporting reads (split reads and discordant mates \ncombined). Default:\
      \ 2 \n"
    info: null
    example:
    - 2
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "double"
    name: "--max_mismappers"
    alternatives:
    - "-m"
    description: "When more than this fraction of supporting reads turns out to be\
      \ \nmismappers, the 'mismappers' filter discards the fusion. Default: \n0.800000\n"
    info: null
    example:
    - 0.8
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "double"
    name: "--max_homolog_identity"
    alternatives:
    - "-L"
    description: "Genes with more than the given fraction of sequence identity are\
      \ \nconsidered homologs and removed by the 'homologs' filter. \nDefault: 0.300000\
      \ \n"
    info: null
    example:
    - 0.3
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--homopolymer_length"
    alternatives:
    - "-H"
    description: "The 'homopolymer' filter removes breakpoints adjacent to \nhomopolymers\
      \ of the given length or more. Default: 6\n"
    info: null
    example:
    - 6
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--read_through_distance"
    alternatives:
    - "-R"
    description: "The 'read_through' filter removes read-through fusions \nwhere the\
      \ breakpoints are less than the given distance away \nfrom each other. Default:\
      \ 10000 \n"
    info: null
    example:
    - 10000
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--min_anchor_length"
    alternatives:
    - "-A"
    description: "Alignment artifacts are often characterized by split reads coming\
      \ \nfrom only one gene and no discordant mates. Moreover, the split \nreads\
      \ only align to a short stretch in one of the genes. The \n'short_anchor' filter\
      \ removes these fusions. This parameter sets \nthe threshold in bp for what\
      \ the filter considers short. Default: 23 \n"
    info: null
    example:
    - 23
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--many_spliced_events"
    alternatives:
    - "-M"
    description: "The 'many_spliced' filter recovers fusions between genes that \n\
      have at least this many spliced breakpoints. Default: 4\n"
    info: null
    example:
    - 4
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "double"
    name: "--max_kmer_content"
    alternatives:
    - "-K"
    description: "The 'low_entropy' filter removes reads with repetitive 3-mers. If\
      \ \nthe 3-mers make up more than the given fraction of the sequence, then \n\
      the read is discarded. Default: 0.600000 \n"
    info: null
    example:
    - 0.6
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "double"
    name: "--max_mismatch_pvalue"
    alternatives:
    - "-V"
    description: "The 'mismatches' filter uses a binomial model to calculate a \n\
      p-value for observing a given number of mismatches in a read. If \nthe number\
      \ of mismatches is too high, the read is discarded. \nDefault: 0.010000 \n"
    info: null
    # Example aligned with the default stated in the description above (0.01),
    # matching how the other examples in this group mirror their defaults.
    example:
    - 0.01
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--fragment_length"
    alternatives:
    - "-F"
    description: "When paired-end data is given, the fragment length is estimated\
      \ \nautomatically and this parameter has no effect. But when single-end \ndata\
      \ is given, the mean fragment length should be specified to \neffectively filter\
      \ fusions that arise from hairpin structures. \nDefault: 200 \n"
    info: null
    example:
    - 200
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--max_reads"
    alternatives:
    - "-U"
    description: "Subsample fusions with more than the given number of supporting\
      \ reads. This \nimproves performance without compromising sensitivity, as long\
      \ as the \nthreshold is high. Counting of supporting reads beyond the threshold\
      \ is \ninaccurate, obviously. Default: 300 \n"
    info: null
    example:
    - 300
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "double"
    name: "--quantile"
    alternatives:
    - "-Q"
    description: "Highly expressed genes are prone to produce artifacts during library\
      \ \npreparation. Genes with an expression above the given quantile are eligible\
      \ \nfor filtering by the 'in_vitro' filter. Default: 0.998000\n"
    info: null
    example:
    - 0.998
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "double"
    name: "--exonic_fraction"
    alternatives:
    - "-e"
    description: "The breakpoints of false-positive predictions of intragenic events\
      \ \nare often both in exons. True predictions are more likely to have at \n\
      least one breakpoint in an intron, because introns are larger. If the \nfraction\
      \ of exonic sequence between two breakpoints is smaller than \nthe given fraction,\
      \ the 'intragenic_exonic' filter discards the \nevent. Default: 0.330000 \n"
    info: null
    example:
    - 0.33
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--top_n"
    alternatives:
    - "-T"
    description: "Only report viral integration sites of the top N most highly expressed\
      \ viral \ncontigs. Default: 5\n"
    info: null
    example:
    - 5
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "double"
    name: "--covered_fraction"
    alternatives:
    - "-C"
    description: "Ignore virally associated events if the virus is not fully \nexpressed,\
      \ i.e., less than the given fraction of the viral contig is \ntranscribed. Default:\
      \ 0.050000 \n"
    info: null
    example:
    - 0.05
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--max_itd_length"
    alternatives:
    - "-l"
    description: "Maximum length of internal tandem duplications. Note: Increasing\
      \ \nthis value beyond the default can impair performance and lead to many \n\
      false positives. Default: 100 \n"
    info: null
    example:
    - 100
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "double"
    name: "--min_itd_allele_fraction"
    alternatives:
    - "-z"
    description: "Required fraction of supporting reads to report an internal \ntandem\
      \ duplication. Default: 0.070000 \n"
    info: null
    example:
    - 0.07
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "integer"
    name: "--min_itd_supporting_reads"
    alternatives:
    - "-Z"
    description: "Required absolute number of supporting reads to report an \ninternal\
      \ tandem duplication. Default: 10 \n"
    info: null
    example:
    - 10
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "boolean_true"
    name: "--skip_duplicate_marking"
    alternatives:
    - "-u"
    description: "Instead of performing duplicate marking itself, Arriba relies on\
      \ duplicate marking by a \npreceding program using the BAM_FDUP flag. This makes\
      \ sense when unique molecular \nidentifiers (UMI) are used.\n"
    info: null
    direction: "input"
  - type: "boolean_true"
    name: "--extra_information"
    alternatives:
    - "-X"
    description: "To reduce the runtime and file size, by default, the columns 'fusion_transcript',\
      \ \n'peptide_sequence', and 'read_identifiers' are left empty in the file containing\
      \ \ndiscarded fusion candidates (see parameter -O). When this flag is set, this\
      \ extra \ninformation is reported in the discarded fusions file.\n"
    info: null
    direction: "input"
  - type: "boolean_true"
    name: "--fill_gaps"
    alternatives:
    - "-I"
    description: "If assembly of the fusion transcript sequence from the supporting\
      \ reads is incomplete \n(denoted as '...'), fill the gaps using the assembly\
      \ sequence wherever possible. \n"
    info: null
    direction: "input"
# Script that implements the component.
resources:
- type: "bash_script"
  path: "script.sh"
  is_executable: true
description: "Detect gene fusions from RNA-Seq data"
# Test script plus the data directory it is shipped with.
test_resources:
- type: "bash_script"
  path: "test.sh"
  is_executable: true
- type: "file"
  path: "test_data"
info: null
status: "enabled"
# `commands` mirrors the package-level config mod
# ".requirements.commands := ['ps']" listed under package_config.config_mods.
requirements:
  cpus: 1
  commands:
  - "ps"
keywords:
- "Gene fusion"
- "RNA-Seq"
license: "MIT"
references:
  doi:
  - "10.1101/gr.257246.119"
links:
  repository: "https://github.com/suhrig/arriba"
  homepage: "https://arriba.readthedocs.io/en/latest/"
  documentation: "https://arriba.readthedocs.io/en/latest/"
# Runners: a standalone executable and a Nextflow module.
runners:
- type: "executable"
  id: "executable"
  docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
  id: "nextflow"
  directives:
    tag: "$id"
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    # Named resource labels. Each label maps to a directive string:
    #   mem<N>gb / mem<N>tb   - decimal sizes (powers of 10), in bytes
    #   mem<N>gib / mem<N>tib - binary sizes (powers of 2), in bytes
    #   cpu<N>                - CPU count
    labels:
      mem1gb: "memory = 1000000000.B"
      mem2gb: "memory = 2000000000.B"
      mem5gb: "memory = 5000000000.B"
      mem10gb: "memory = 10000000000.B"
      mem20gb: "memory = 20000000000.B"
      mem50gb: "memory = 50000000000.B"
      mem100gb: "memory = 100000000000.B"
      mem200gb: "memory = 200000000000.B"
      mem500gb: "memory = 500000000000.B"
      mem1tb: "memory = 1000000000000.B"
      mem2tb: "memory = 2000000000000.B"
      mem5tb: "memory = 5000000000000.B"
      mem10tb: "memory = 10000000000000.B"
      mem20tb: "memory = 20000000000000.B"
      mem50tb: "memory = 50000000000000.B"
      mem100tb: "memory = 100000000000000.B"
      mem200tb: "memory = 200000000000000.B"
      mem500tb: "memory = 500000000000000.B"
      mem1gib: "memory = 1073741824.B"
      mem2gib: "memory = 2147483648.B"
      mem4gib: "memory = 4294967296.B"
      mem8gib: "memory = 8589934592.B"
      mem16gib: "memory = 17179869184.B"
      mem32gib: "memory = 34359738368.B"
      mem64gib: "memory = 68719476736.B"
      mem128gib: "memory = 137438953472.B"
      mem256gib: "memory = 274877906944.B"
      mem512gib: "memory = 549755813888.B"
      mem1tib: "memory = 1099511627776.B"
      mem2tib: "memory = 2199023255552.B"
      mem4tib: "memory = 4398046511104.B"
      mem8tib: "memory = 8796093022208.B"
      mem16tib: "memory = 17592186044416.B"
      mem32tib: "memory = 35184372088832.B"
      mem64tib: "memory = 70368744177664.B"
      mem128tib: "memory = 140737488355328.B"
      mem256tib: "memory = 281474976710656.B"
      mem512tib: "memory = 562949953421312.B"
      cpu1: "cpus = 1"
      cpu2: "cpus = 2"
      cpu5: "cpus = 5"
      cpu10: "cpus = 10"
      cpu20: "cpus = 20"
      cpu50: "cpus = 50"
      cpu100: "cpus = 100"
      cpu200: "cpus = 200"
      cpu500: "cpus = 500"
      cpu1000: "cpus = 1000"
  debug: false
  container: "docker"
# Execution engines: a Docker image based on the biocontainers arriba build,
# and a native engine.
engines:
- type: "docker"
  id: "docker"
  image: "quay.io/biocontainers/arriba:2.4.0--h0033a41_2"
  target_registry: "images.viash-hub.com"
  target_tag: "main"
  namespace_separator: "/"
  setup:
  - type: "docker"
    # Extracts the "Version:" line from `arriba -h` and writes it to
    # /var/software_versions.txt in the form `arriba: "<version>"`.
    run:
    - "arriba -h | grep 'Version:' 2>&1 | sed 's/Version:\\s\\(.*\\)/arriba: \"\\\
      1\"/' > /var/software_versions.txt\n"
  entrypoint: []
  cmd: null
- type: "native"
  id: "native"
# Provenance of this build: source config, build target, and git state.
build_info:
  config: "src/arriba/config.vsh.yaml"
  runner: "executable"
  engine: "docker|native"
  output: "target/executable/arriba"
  executable: "target/executable/arriba/arriba"
  viash_version: "0.9.0"
  git_commit: "7f8bcc2b3e1ffaac9778b6acb42420b19660d1a1"
  # SECURITY: the recorded remote previously embedded an access token in the
  # URL userinfo ("x-access-token:<token>@"). Credentials must never be stored
  # in a published artifact, so only the plain repository URL is kept here.
  # The exposed token should be treated as compromised and revoked.
  git_remote: "https://github.com/viash-hub/biobox"
  git_tag: "v0.2.0-3-g7f8bcc2"
# Package-level settings of the enclosing "biobox" package.
package_config:
  name: "biobox"
  version: "main"
  description: "A collection of bioinformatics tools for working with sequence data.\n"
  info: null
  viash_version: "0.9.0"
  source: "src"
  target: "target"
  # Config modifications declared at package level: they set the required
  # commands, add a native engine, and set the Docker target registry and tag.
  config_mods:
  - ".requirements.commands := ['ps']\n"
  - ".engines += { type: \"native\" }"
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'main'"
  keywords:
  - "bioinformatics"
  - "modules"
  - "sequencing"
  license: "MIT"
  organization: "vsh"
  links:
    repository: "https://github.com/viash-hub/biobox"
    issue_tracker: "https://github.com/viash-hub/biobox/issues"