# biobox/target/executable/arriba/.config.vsh.yaml

# Component name and version label.
name: 'arriba'
version: 'main'
# People credited for this component, with their roles, contact links and
# organizational affiliations.
authors:
- name: 'Robrecht Cannoodt'
  roles:
  - 'author'
  - 'maintainer'
  info:
    links:
      email: 'robrecht@data-intuitive.com'
      github: 'rcannood'
      orcid: '0000-0003-3641-729X'
      linkedin: 'robrechtcannoodt'
    organizations:
    - name: 'Data Intuitive'
      href: 'https://www.data-intuitive.com'
      role: 'Data Science Engineer'
    - name: 'Open Problems'
      href: 'https://openproblems.bio'
      role: 'Core Member'
# Command-line interface of the component, organised into named groups.
argument_groups:
# Input files. --bam, --genome and --gene_annotation are required; all other
# inputs in this group are optional (required: false).
- name: "Inputs"
  arguments:
  # Main alignments in SAM/BAM/CRAM format (short flag -x).
  - type: "file"
    name: "--bam"
    alternatives:
    - "-x"
    description: "File in SAM/BAM/CRAM format with main alignments as generated by\
      \ STAR\n(Aligned.out.sam). Arriba extracts candidate reads from this file.\n"
    info: null
    example:
    - "Aligned.out.bam"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Genome assembly in FastA format (short flag -a).
  - type: "file"
    name: "--genome"
    alternatives:
    - "-a"
    description: "FastA file with genome sequence (assembly). The file may be gzip-compressed.\
      \ An \nindex with the file extension .fai must exist only if CRAM files are\
      \ processed.\n"
    info: null
    example:
    - "assembly.fa"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Gene annotation in GTF format (short flag -g).
  - type: "file"
    name: "--gene_annotation"
    alternatives:
    - "-g"
    description: "GTF file with gene annotation. The file may be gzip-compressed.\n"
    info: null
    example:
    - "annotation.gtf"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Optional two-column list of known/recurrent fusions (short flag -k).
  - type: "file"
    name: "--known_fusions"
    alternatives:
    - "-k"
    description: "File containing known/recurrent fusions. Some cancer entities are\
      \ often \ncharacterized by fusions between the same pair of genes. In order\
      \ to boost \nsensitivity, a list of known fusions can be supplied using this\
      \ parameter. The list \nmust contain two columns with the names of the fused\
      \ genes, separated by tabs.\n"
    info: null
    example:
    - "known_fusions.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Optional list of blacklisted events (short flag -b).
  - type: "file"
    name: "--blacklist"
    alternatives:
    - "-b"
    description: "File containing blacklisted events (recurrent artifacts and transcripts\
      \ \nobserved in healthy tissue).\n"
    info: null
    example:
    - "blacklist.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Optional structural-variant coordinates from whole-genome sequencing
  # (short flag -d).
  - type: "file"
    name: "--structural_variants"
    alternatives:
    - "-d"
    description: "Tab-separated file with coordinates of structural variants found\
      \ using \nwhole-genome sequencing data. These coordinates serve to increase\
      \ sensitivity \ntowards weakly expressed fusions and to eliminate fusions with\
      \ low evidence. \n"
    info: null
    example:
    - "structural_variants_from_WGS.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Optional gene-pair tags used to annotate fusions (short flag -t).
  - type: "file"
    name: "--tags"
    alternatives:
    - "-t"
    description: "Tab-separated file containing fusions to annotate with tags in the\
      \ 'tags' column. \nThe first two columns specify the genes; the third column\
      \ specifies the tag. The \nfile may be gzip-compressed. \n"
    info: null
    example:
    - "tags.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Optional protein-domain coordinates in GFF3 format (short flag -p).
  - type: "file"
    name: "--protein_domains"
    alternatives:
    - "-p"
    description: "File in GFF3 format containing coordinates of the protein domains\
      \ of genes. The\nprotein domains retained in a fusion are listed in the column\n\
      'retained_protein_domains'. The file may be gzip-compressed.\n"
    info: null
    example:
    - "protein_domains.gff3"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
# Output files. --fusions is required; --fusions_discarded is optional.
- name: "Outputs"
  arguments:
  # Fusions that passed all filters (short flag -o).
  - type: "file"
    name: "--fusions"
    alternatives:
    - "-o"
    description: "Output file with fusions that have passed all filters.\n"
    info: null
    example:
    - "fusions.tsv"
    must_exist: true
    create_parent: true
    required: true
    direction: "output"
    multiple: false
    multiple_sep: ";"
  # Fusions that were removed by filtering (short flag -O).
  - type: "file"
    name: "--fusions_discarded"
    alternatives:
    - "-O"
    description: "Output file with fusions that were discarded due to filtering. \n"
    info: null
    example:
    - "fusions.discarded.tsv"
    must_exist: true
    create_parent: true
    required: false
    direction: "output"
    multiple: false
    multiple_sep: ";"
# Optional tuning parameters and flags; every argument in this group is
# either required: false or a boolean flag.
- name: "Arguments"
  arguments:
  # Maximum distance between a genomic and a transcriptomic breakpoint for the
  # two to be treated as related (only relevant with --structural_variants).
  - type: "long"
    name: "--max_genomic_breakpoint_distance"
    alternatives:
    - "-D"
    description: "When a file with genomic breakpoints obtained via \nwhole-genome\
      \ sequencing is supplied via the --structural_variants\nparameter, this parameter\
      \ determines how far a \ngenomic breakpoint may be away from a \ntranscriptomic\
      \ breakpoint to consider it as a \nrelated event. For events inside genes, the\
      \ \ndistance is added to the end of the gene; for \nintergenic events, the distance\
      \ threshold is \napplied as is. Default: 100000.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Library strandedness; restricted to the values listed under `choices`.
  # "yes" and "no" are quoted so they stay strings rather than YAML booleans.
  - type: "string"
    name: "--strandedness"
    alternatives:
    - "-s"
    description: "Whether a strand-specific protocol was used for library preparation,\
      \ \nand if so, the type of strandedness (auto/yes/no/reverse). When \nunstranded\
      \ data is processed, the strand can sometimes be inferred from \nsplice-patterns.\
      \ But in unclear situations, stranded data helps \nresolve ambiguities. Default:\
      \ auto\n"
    info: null
    required: false
    choices:
    - "auto"
    - "yes"
    - "no"
    - "reverse"
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Contigs to consider; accepts several values (multiple: true, separated
  # by ";").
  - type: "string"
    name: "--interesting_contigs"
    alternatives:
    - "-i"
    description: "List of interesting contigs. Fusions between genes \non other contigs\
      \ are ignored. Contigs can be specified with or without the \nprefix \"chr\"\
      . Asterisks (*) are treated as wild-cards. \nDefault: 1 2 3 4 5 6 7 8 9 10 11\
      \ 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_*\n"
    info: null
    example:
    - "1"
    - "2"
    - "AC_*"
    - "NC_*"
    required: false
    direction: "input"
    multiple: true
    multiple_sep: ";"
  # Viral contigs; accepts several values (multiple: true, separated by ";").
  - type: "string"
    name: "--viral_contigs"
    alternatives:
    - "-v"
    description: "List of viral contigs. Asterisks (*) are treated as \nwild-cards.\n\
      Default: AC_* NC_*\n"
    info: null
    example:
    - "AC_*"
    - "NC_*"
    required: false
    direction: "input"
    multiple: true
    multiple_sep: ";"
  # Filters to switch off; accepts several values, each of which must be one
  # of the names listed under `choices`.
  - type: "string"
    name: "--disable_filters"
    alternatives:
    - "-f"
    description: "List of filters to disable. By default all filters are \nenabled.\
      \ \n"
    info: null
    required: false
    choices:
    - "homologs"
    - "low_entropy"
    - "isoforms"
    - "top_expressed_viral_contigs"
    - "viral_contigs"
    - "uninteresting_contigs"
    - "non_coding_neighbors"
    - "mismatches"
    - "duplicates"
    - "no_genomic_support"
    - "genomic_support"
    - "intronic"
    - "end_to_end"
    - "relative_support"
    - "low_coverage_viral_contigs"
    - "merge_adjacent"
    - "mismappers"
    - "multimappers"
    - "same_gene"
    - "long_gap"
    - "internal_tandem_duplication"
    - "small_insert_size"
    - "read_through"
    - "inconsistently_clipped"
    - "intragenic_exonic"
    - "marginal_read_through"
    - "spliced"
    - "hairpin"
    - "blacklist"
    - "min_support"
    - "select_best"
    - "in_vitro"
    - "short_anchor"
    - "known_fusions"
    - "no_coverage"
    - "homopolymer"
    - "many_spliced"
    direction: "input"
    multiple: true
    multiple_sep: ";"
  # E-value threshold of the 'relative_support' filter.
  - type: "double"
    name: "--max_e_value"
    alternatives:
    - "-E"
    description: "Arriba estimates the number of fusions with a given number of supporting\
      \ \nreads which one would expect to see by random chance. If the expected number\
      \ \nof fusions (e-value) is higher than this threshold, the fusion is \ndiscarded\
      \ by the 'relative_support' filter. Note: Increasing this \nthreshold can dramatically\
      \ increase the number of false positives and may \nincrease the runtime of resource-intensive\
      \ steps. Fractional values are \npossible. Default: 0.300000 \n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Minimum supporting-read count of the 'min_support' filter.
  - type: "integer"
    name: "--min_supporting_reads"
    alternatives:
    - "-S"
    description: "The 'min_support' filter discards all fusions with fewer than \n\
      this many supporting reads (split reads and discordant mates \ncombined). Default:\
      \ 2 \n"
    info: null
    example:
    - 2
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Mismapper fraction threshold of the 'mismappers' filter.
  - type: "double"
    name: "--max_mismappers"
    alternatives:
    - "-m"
    description: "When more than this fraction of supporting reads turns out to be\
      \ \nmismappers, the 'mismappers' filter discards the fusion. Default: \n0.800000\n"
    info: null
    example:
    - 0.8
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Sequence-identity threshold of the 'homologs' filter.
  - type: "double"
    name: "--max_homolog_identity"
    alternatives:
    - "-L"
    description: "Genes with more than the given fraction of sequence identity are\
      \ \nconsidered homologs and removed by the 'homologs' filter. \nDefault: 0.300000\
      \ \n"
    info: null
    example:
    - 0.3
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Homopolymer length threshold of the 'homopolymer' filter.
  - type: "integer"
    name: "--homopolymer_length"
    alternatives:
    - "-H"
    description: "The 'homopolymer' filter removes breakpoints adjacent to \nhomopolymers\
      \ of the given length or more. Default: 6\n"
    info: null
    example:
    - 6
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Breakpoint distance threshold of the 'read_through' filter.
  - type: "integer"
    name: "--read_through_distance"
    alternatives:
    - "-R"
    description: "The 'read_through' filter removes read-through fusions \nwhere the\
      \ breakpoints are less than the given distance away \nfrom each other. Default:\
      \ 10000 \n"
    info: null
    example:
    - 10000
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Anchor length (bp) below which the 'short_anchor' filter applies.
  - type: "integer"
    name: "--min_anchor_length"
    alternatives:
    - "-A"
    description: "Alignment artifacts are often characterized by split reads coming\
      \ \nfrom only one gene and no discordant mates. Moreover, the split \nreads\
      \ only align to a short stretch in one of the genes. The \n'short_anchor' filter\
      \ removes these fusions. This parameter sets \nthe threshold in bp for what\
      \ the filter considers short. Default: 23 \n"
    info: null
    example:
    - 23
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Spliced-breakpoint count used by the 'many_spliced' recovery filter.
  - type: "integer"
    name: "--many_spliced_events"
    alternatives:
    - "-M"
    description: "The 'many_spliced' filter recovers fusions between genes that \n\
      have at least this many spliced breakpoints. Default: 4\n"
    info: null
    example:
    - 4
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Repetitive 3-mer fraction threshold of the 'low_entropy' filter.
  - type: "double"
    name: "--max_kmer_content"
    alternatives:
    - "-K"
    description: "The 'low_entropy' filter removes reads with repetitive 3-mers. If\
      \ \nthe 3-mers make up more than the given fraction of the sequence, then \n\
      the read is discarded. Default: 0.600000 \n"
    info: null
    example:
    - 0.6
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # P-value threshold of the 'mismatches' read filter.
  # The example repeats the default stated in the description (0.01), as the
  # examples of the other numeric arguments in this group do.
  - type: "double"
    name: "--max_mismatch_pvalue"
    alternatives:
    - "-V"
    description: "The 'mismatches' filter uses a binomial model to calculate a \n\
      p-value for observing a given number of mismatches in a read. If \nthe number\
      \ of mismatches is too high, the read is discarded. \nDefault: 0.010000 \n"
    info: null
    example:
    - 0.01
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Mean fragment length; per the description only relevant for single-end
  # data.
  - type: "integer"
    name: "--fragment_length"
    alternatives:
    - "-F"
    description: "When paired-end data is given, the fragment length is estimated\
      \ \nautomatically and this parameter has no effect. But when single-end \ndata\
      \ is given, the mean fragment length should be specified to \neffectively filter\
      \ fusions that arise from hairpin structures. \nDefault: 200 \n"
    info: null
    example:
    - 200
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Supporting-read count above which fusions are subsampled.
  - type: "integer"
    name: "--max_reads"
    alternatives:
    - "-U"
    description: "Subsample fusions with more than the given number of supporting\
      \ reads. This \nimproves performance without compromising sensitivity, as long\
      \ as the \nthreshold is high. Counting of supporting reads beyond the threshold\
      \ is \ninaccurate, obviously. Default: 300 \n"
    info: null
    example:
    - 300
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Expression quantile above which genes are eligible for the 'in_vitro'
  # filter.
  - type: "double"
    name: "--quantile"
    alternatives:
    - "-Q"
    description: "Highly expressed genes are prone to produce artifacts during library\
      \ \npreparation. Genes with an expression above the given quantile are eligible\
      \ \nfor filtering by the 'in_vitro' filter. Default: 0.998000\n"
    info: null
    example:
    - 0.998
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Exonic sequence fraction threshold of the 'intragenic_exonic' filter.
  - type: "double"
    name: "--exonic_fraction"
    alternatives:
    - "-e"
    description: "The breakpoints of false-positive predictions of intragenic events\
      \ \nare often both in exons. True predictions are more likely to have at \n\
      least one breakpoint in an intron, because introns are larger. If the \nfraction\
      \ of exonic sequence between two breakpoints is smaller than \nthe given fraction,\
      \ the 'intragenic_exonic' filter discards the \nevent. Default: 0.330000 \n"
    info: null
    example:
    - 0.33
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Number of top-expressed viral contigs whose integration sites are reported.
  - type: "integer"
    name: "--top_n"
    alternatives:
    - "-T"
    description: "Only report viral integration sites of the top N most highly expressed\
      \ viral \ncontigs. Default: 5\n"
    info: null
    example:
    - 5
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Minimum transcribed fraction of a viral contig for its events to be kept.
  - type: "double"
    name: "--covered_fraction"
    alternatives:
    - "-C"
    description: "Ignore virally associated events if the virus is not fully \nexpressed,\
      \ i.e., less than the given fraction of the viral contig is \ntranscribed. Default:\
      \ 0.050000 \n"
    info: null
    example:
    - 0.05
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Maximum length of internal tandem duplications (ITDs).
  - type: "integer"
    name: "--max_itd_length"
    alternatives:
    - "-l"
    description: "Maximum length of internal tandem duplications. Note: Increasing\
      \ \nthis value beyond the default can impair performance and lead to many \n\
      false positives. Default: 100 \n"
    info: null
    example:
    - 100
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Fraction of supporting reads required to report an ITD.
  - type: "double"
    name: "--min_itd_allele_fraction"
    alternatives:
    - "-z"
    description: "Required fraction of supporting reads to report an internal \ntandem\
      \ duplication. Default: 0.070000 \n"
    info: null
    example:
    - 0.07
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Absolute number of supporting reads required to report an ITD.
  - type: "integer"
    name: "--min_itd_supporting_reads"
    alternatives:
    - "-Z"
    description: "Required absolute number of supporting reads to report an \ninternal\
      \ tandem duplication. Default: 10 \n"
    info: null
    example:
    - 10
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Boolean flags (type boolean_true); they take no value on the command line.
  # Rely on duplicate marking done by a preceding program (short flag -u).
  - type: "boolean_true"
    name: "--skip_duplicate_marking"
    alternatives:
    - "-u"
    description: "Instead of performing duplicate marking itself, Arriba relies on\
      \ duplicate marking by a \npreceding program using the BAM_FDUP flag. This makes\
      \ sense when unique molecular \nidentifiers (UMI) are used.\n"
    info: null
    direction: "input"
  # Also fill the extra columns in the discarded-fusions file (short flag -X).
  - type: "boolean_true"
    name: "--extra_information"
    alternatives:
    - "-X"
    description: "To reduce the runtime and file size, by default, the columns 'fusion_transcript',\
      \ \n'peptide_sequence', and 'read_identifiers' are left empty in the file containing\
      \ \ndiscarded fusion candidates (see parameter -O). When this flag is set, this\
      \ extra \ninformation is reported in the discarded fusions file.\n"
    info: null
    direction: "input"
  # Fill gaps in assembled fusion transcripts from the assembly sequence
  # (short flag -I).
  - type: "boolean_true"
    name: "--fill_gaps"
    alternatives:
    - "-I"
    description: "If assembly of the fusion transcript sequence from the supporting\
      \ reads is incomplete \n(denoted as '...'), fill the gaps using the assembly\
      \ sequence wherever possible. \n"
    info: null
    direction: "input"
# Files bundled with the component; script.sh is marked executable.
resources:
- type: 'bash_script'
  path: 'script.sh'
  is_executable: true
description: 'Detect gene fusions from RNA-Seq data'
# Test script together with its test data path.
test_resources:
- type: 'bash_script'
  path: 'test.sh'
  is_executable: true
- type: 'file'
  path: 'test_data'
info: null
status: 'enabled'
# Declared requirements: one CPU and the `ps` command.
requirements:
  cpus: 1
  commands:
  - 'ps'
keywords:
- 'Gene fusion'
- 'RNA-Seq'
license: 'MIT'
# Publication DOI and upstream project links for the wrapped tool.
references:
  doi:
  - '10.1101/gr.257246.119'
links:
  repository: 'https://github.com/suhrig/arriba'
  homepage: 'https://arriba.readthedocs.io/en/latest/'
  documentation: 'https://arriba.readthedocs.io/en/latest/'
# Runners configured for this component: a plain executable and a Nextflow
# module.
runners:
- type: 'executable'
  id: 'executable'
  docker_setup_strategy: 'ifneedbepullelsecachedbuild'
- type: 'nextflow'
  id: 'nextflow'
  directives:
    tag: '$id'
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    # Named resource labels; each maps a label to a `memory = <bytes>.B` or
    # `cpus = <n>` setting string.
    labels:
      # Decimal memory sizes (powers of 10).
      mem1gb: 'memory = 1000000000.B'
      mem2gb: 'memory = 2000000000.B'
      mem5gb: 'memory = 5000000000.B'
      mem10gb: 'memory = 10000000000.B'
      mem20gb: 'memory = 20000000000.B'
      mem50gb: 'memory = 50000000000.B'
      mem100gb: 'memory = 100000000000.B'
      mem200gb: 'memory = 200000000000.B'
      mem500gb: 'memory = 500000000000.B'
      mem1tb: 'memory = 1000000000000.B'
      mem2tb: 'memory = 2000000000000.B'
      mem5tb: 'memory = 5000000000000.B'
      mem10tb: 'memory = 10000000000000.B'
      mem20tb: 'memory = 20000000000000.B'
      mem50tb: 'memory = 50000000000000.B'
      mem100tb: 'memory = 100000000000000.B'
      mem200tb: 'memory = 200000000000000.B'
      mem500tb: 'memory = 500000000000000.B'
      # Binary memory sizes (powers of 2).
      mem1gib: 'memory = 1073741824.B'
      mem2gib: 'memory = 2147483648.B'
      mem4gib: 'memory = 4294967296.B'
      mem8gib: 'memory = 8589934592.B'
      mem16gib: 'memory = 17179869184.B'
      mem32gib: 'memory = 34359738368.B'
      mem64gib: 'memory = 68719476736.B'
      mem128gib: 'memory = 137438953472.B'
      mem256gib: 'memory = 274877906944.B'
      mem512gib: 'memory = 549755813888.B'
      mem1tib: 'memory = 1099511627776.B'
      mem2tib: 'memory = 2199023255552.B'
      mem4tib: 'memory = 4398046511104.B'
      mem8tib: 'memory = 8796093022208.B'
      mem16tib: 'memory = 17592186044416.B'
      mem32tib: 'memory = 35184372088832.B'
      mem64tib: 'memory = 70368744177664.B'
      mem128tib: 'memory = 140737488355328.B'
      mem256tib: 'memory = 281474976710656.B'
      mem512tib: 'memory = 562949953421312.B'
      # CPU counts.
      cpu1: 'cpus = 1'
      cpu2: 'cpus = 2'
      cpu5: 'cpus = 5'
      cpu10: 'cpus = 10'
      cpu20: 'cpus = 20'
      cpu50: 'cpus = 50'
      cpu100: 'cpus = 100'
      cpu200: 'cpus = 200'
      cpu500: 'cpus = 500'
      cpu1000: 'cpus = 1000'
  debug: false
  container: 'docker'
# Execution engines: a Docker image based on the biocontainers arriba 2.4.0
# build, and a native engine.
engines:
- type: "docker"
  id: "docker"
  image: "quay.io/biocontainers/arriba:2.4.0--h0033a41_2"
  target_registry: "images.viash-hub.com"
  target_tag: "main"
  namespace_separator: "/"
  setup:
  # Image setup step: extracts the "Version:" line from `arriba -h` and writes
  # it as `arriba: "<version>"` to /var/software_versions.txt.
  # NOTE(review): `2>&1` is attached to grep, not to `arriba -h`; if arriba
  # prints its help text to stderr the version would not be captured - confirm.
  - type: "docker"
    run:
    - "arriba -h | grep 'Version:' 2>&1 |  sed 's/Version:\\s\\(.*\\)/arriba: \"\\\
      1\"/' > /var/software_versions.txt\n"
  entrypoint: []
  cmd: null
- type: "native"
  id: "native"
# Provenance of this build: source config, runner, engines, output location,
# viash version and the git state it was built from.
build_info:
  config: "src/arriba/config.vsh.yaml"
  runner: "executable"
  engine: "docker|native"
  output: "target/executable/arriba"
  executable: "target/executable/arriba/arriba"
  viash_version: "0.9.0"
  git_commit: "7f8bcc2b3e1ffaac9778b6acb42420b19660d1a1"
  # The remote is recorded without a userinfo part. Never embed an access
  # token (e.g. "x-access-token:<token>@") in this URL: the file is committed
  # and published, so any credential stored here is leaked.
  git_remote: "https://github.com/viash-hub/biobox"
  git_tag: "v0.2.0-3-g7f8bcc2"
# Settings of the enclosing package ("biobox").
package_config:
  name: 'biobox'
  version: 'main'
  description: |
    A collection of bioinformatics tools for working with sequence data.
  info: null
  viash_version: '0.9.0'
  source: 'src'
  target: 'target'
  # Config modifications listed for the package: set the required commands
  # to `ps`, add a native engine, and set the docker target registry and tag.
  config_mods:
  - |
    .requirements.commands := ['ps']
  - '.engines += { type: "native" }'
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'main'"
  keywords:
  - 'bioinformatics'
  - 'modules'
  - 'sequencing'
  license: 'MIT'
  organization: 'vsh'
  links:
    repository: 'https://github.com/viash-hub/biobox'
    issue_tracker: 'https://github.com/viash-hub/biobox/issues'