biobox/target/executable/umi_tools/umi_tools_dedup/.config.vsh.yaml

# Component identity; name and namespace match the build output path recorded
# under build_info (target/executable/umi_tools/umi_tools_dedup).
name: "umi_tools_dedup"
namespace: "umi_tools"
version: "main"
# People responsible for this component, with their roles and contact links.
authors:
- name: "Emma Rousseau"
  roles:
  - "author"
  - "maintainer"
  info:
    links:
      email: "emma@data-intuitive.com"
      github: "emmarousseau"
      linkedin: "emmarousseau1"
    organizations:
    - name: "Data Intuitive"
      href: "https://www.data-intuitive.com"
      role: "Bioinformatician"
argument_groups:
# Input alignment file, its format flag, an optional index, and the RNG seed.
- name: "Inputs"
  arguments:
  # Required alignment file; must already exist when the component starts.
  - type: "file"
    name: "--input"
    alternatives:
    - "--stdin"
    description: "Input BAM or SAM file. Use --in_sam to specify SAM format."
    info: null
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Flag switching the expected input format from BAM to SAM.
  - type: "boolean_true"
    name: "--in_sam"
    description: "By default, inputs are assumed to be in BAM format. Use this option\
      \ to specify the use of SAM\nformat for input.\n"
    info: null
    direction: "input"
  # Optional BAM index file.
  - type: "file"
    name: "--bai"
    description: "BAM index"
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Optional seed for the random number generator.
  - type: "integer"
    name: "--random_seed"
    description: "Random seed to initialize number generator with."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
# Output file, output-format and pairing flags, the stats prefix, and the
# options describing where UMIs and cell barcodes are stored in each read.
- name: "Outputs"
  arguments:
  # Required deduplicated alignment file (the only `direction: output` entry
  # in this group).
  - type: "file"
    name: "--output"
    alternatives:
    - "--stdout"
    description: "Deduplicated BAM file."
    info: null
    must_exist: true
    create_parent: true
    required: true
    direction: "output"
    multiple: false
    multiple_sep: ";"
  # Flag switching the output format from BAM to SAM.
  - type: "boolean_true"
    name: "--out_sam"
    description: "By default, outputs are written in BAM format. Use this option\
      \ to specify the use of SAM format\nfor output.\n"
    info: null
    direction: "input"
  # Flag for paired-end input.
  - type: "boolean_true"
    name: "--paired"
    description: "BAM is paired end - output both read pairs. This will also force\
      \ the use of the template length\nto determine reads with the same mapping coordinates.\n"
    info: null
    direction: "input"
  # File-name prefix (a string, not a file) for the statistics files.
  - type: "string"
    name: "--output_stats"
    description: "Generate files containing UMI based deduplication statistics files\
      \ with this prefix in the file names.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # How the UMI was encoded in the read; restricted to the listed choices.
  - type: "string"
    name: "--extract_umi_method"
    description: "Specify the method by which the barcodes were encoded in the read.\n\
      The options are:\n  * read_id (default) \n  * tag\n  * umis\n"
    info: null
    example:
    - "read_id"
    required: false
    choices:
    - "read_id"
    - "tag"
    - "umis"
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Tag holding the UMI; relevant for the `tag` extraction method.
  - type: "string"
    name: "--umi_tag"
    description: "The tag containing the UMI sequence. This is only required if the\
      \ extract_umi_method is set to tag.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # UMI separator; relevant for the `read_id` extraction method (the
  # description previously named a non-existent `id_read` method).
  - type: "string"
    name: "--umi_separator"
    description: "The separator used to separate the UMI from the read sequence. This\
      \ is only required if the\nextract_umi_method is set to read_id. Default: `_`.\n"
    info: null
    example:
    - "_"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Split the UMI tag value and keep the first element.
  - type: "string"
    name: "--umi_tag_split"
    description: "Separate the UMI in tag by <SPLIT> and take the first element."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Split the UMI tag value and concatenate the elements.
  - type: "string"
    name: "--umi_tag_delimiter"
    description: "Separate the UMI in tag by <DELIMITER> and concatenate the elements."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Tag holding the cell barcode; relevant for the `tag` extraction method.
  - type: "string"
    name: "--cell_tag"
    description: "The tag containing the cell barcode sequence. This is only required\
      \ if the extract_umi_method\nis set to tag.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Split the cell barcode tag value and keep the first element.
  - type: "string"
    name: "--cell_tag_split"
    description: "Separate the cell barcode in tag by <SPLIT> and take the first element."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Split the cell barcode tag value and concatenate the elements.
  - type: "string"
    name: "--cell_tag_delimiter"
    description: "Separate the cell barcode in tag by <DELIMITER> and concatenate the\
      \ elements."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
# Options controlling how reads are grouped into duplicate sets.
# Defaults are stated in the descriptions and mirrored as `example` values;
# no `default` key is set on any argument in this group.
- name: "Grouping Options"
  arguments:
  # Grouping method; restricted to the five values listed under `choices`.
  - type: "string"
    name: "--method"
    description: "The method to use for grouping reads. \nThe options are: \n  * unique\n\
      \  * percentile\n  * cluster\n  * adjacency\n  * directional (default)\n"
    info: null
    example:
    - "directional"
    required: false
    choices:
    - "unique"
    - "percentile"
    - "cluster"
    - "adjacency"
    - "directional"
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Edit-distance threshold for connecting UMIs (adjacency/cluster methods).
  - type: "integer"
    name: "--edit_distance_threshold"
    description: "For the adjacency and cluster methods the threshold for the edit\
      \ distance to connect two\nUMIs in the network can be increased. The default\
      \ value of 1 works best unless the UMI is\nvery long (>14bp). Default: `1`.\n"
    info: null
    example:
    - 1
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Flag: treat spliced and unspliced reads at the same position as distinct.
  - type: "boolean_true"
    name: "--spliced_is_unique"
    description: "Causes two reads that start in the same position on the same strand\
      \ and having the same UMI\nto be considered unique if one is spliced and the\
      \ other is not. (Uses the 'N' cigar operation\nto test for splicing).\n"
    info: null
    direction: "input"
  # Number of 3' soft-clipped bases at which a read is treated as spliced.
  - type: "integer"
    name: "--soft_clip_threshold"
    description: "Mappers that soft clip will sometimes do so rather than mapping\
      \ a spliced read if there is only\na small overhang over the exon junction.\
      \ By setting this option, you can treat reads with at\nleast this many bases\
      \ soft-clipped at the 3' end as spliced. Default: `4`.\n"
    info: null
    example:
    - 4
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # NOTE(review): the description lists `NH`, `X0` and `XT` as the supported
  # tags, but no `choices` constraint is declared here - confirm whether that
  # is intentional.
  - type: "string"
    name: "--multimapping_detection_method"
    description: "If the sam/bam contains tags to identify multimapping reads, you\
      \ can specify for use when selecting\nthe best read at a given loci. Supported\
      \ tags are `NH`, `X0` and `XT`. If not specified, the read\nwith the highest\
      \ mapping quality will be selected.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Flag: additionally use read length when deduplicating.
  - type: "boolean_true"
    name: "--read_length"
    description: "Use the read length as a criteria when deduping, for e.g. sRNA-Seq."
    info: null
    direction: "input"
# Options for grouping reads per gene, per contig and/or per cell.
- name: "Single-cell RNA-Seq Options"
  arguments:
  # Flag: group reads by gene; per its description it must be combined with
  # --gene_tag or --per_contig.
  - type: "boolean_true"
    name: "--per_gene"
    description: "Reads will be grouped together if they have the same gene. This\
      \ is useful if your library prep\ngenerates PCR duplicates with non identical\
      \ alignment positions such as CEL-Seq. Note this option\nis hardcoded to be\
      \ on with the count command. I.e. counting is always performed per-gene. Must\
      \ be\ncombined with either --gene_tag or --per_contig option.\n"
    info: null
    direction: "input"
  # BAM read tag that carries the gene assignment.
  - type: "string"
    name: "--gene_tag"
    description: "Deduplicate per gene. The gene information is encoded in the bam\
      \ read tag specified.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # BAM tag telling whether a read is assigned to a gene.
  - type: "string"
    name: "--assigned_status_tag"
    description: "BAM tag which describes whether a read is assigned to a gene. Defaults\
      \ to the same value as given\nfor --gene_tag.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Regex for tag values whose reads should be skipped.
  - type: "string"
    name: "--skip_tags_regex"
    description: "Use in conjunction with the --assigned_status_tag option to skip\
      \ any reads where the tag matches\nthis regex. Default (\"^[__|Unassigned]\"\
      ) matches anything which starts with \"__\" or \"Unassigned\".\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Flag: deduplicate per contig (RNAME) rather than per alignment position.
  - type: "boolean_true"
    name: "--per_contig"
    description: "Deduplicate per contig (field 3 in BAM; RNAME). All reads with the\
      \ same contig will be considered to\nhave the same alignment position. This is\
      \ useful if you have aligned to a reference transcriptome\nwith one transcript\
      \ per gene. If you have aligned to a transcriptome with more than one transcript\n\
      per gene, you can supply a map between transcripts and gene using the --gene_transcript_map\
      \ option.\n"
    info: null
    direction: "input"
  # Optional tab-separated gene/transcript mapping file; must exist if given.
  - type: "file"
    name: "--gene_transcript_map"
    description: "A file containing a mapping between gene names and transcript names.\
      \ The file should be tab\nseparated with the gene name in the first column and\
      \ the transcript name in the second column.\n"
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Flag: only group reads that share a cell barcode.
  - type: "boolean_true"
    name: "--per_cell"
    description: "Reads will only be grouped together if they have the same cell barcode.\
      \ Can be combined with\n--per_gene.\n"
    info: null
    direction: "input"
# Read-level filtering and handling of unmapped, chimeric and unpaired reads.
# The three *_reads / *_pairs options share the same value set
# (discard / use / output) and all declare it under `choices`.
- name: "SAM/BAM Options"
  arguments:
  # Minimum MAPQ for a read to be kept.
  - type: "integer"
    name: "--mapping_quality"
    description: "Minimum mapping quality (MAPQ) for a read to be retained. Default:\
      \ `0`.\n"
    info: null
    example:
    - 0
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Handling of unmapped reads; `choices` added so invalid values are rejected
  # the same way as for --chimeric_pairs and --unpaired_reads.
  - type: "string"
    name: "--unmapped_reads"
    description: "How unmapped reads should be handled. \nThe options are:\n  * \"\
      discard\": Discard all unmapped reads. (default)\n  * \"use\":     If read2\
      \ is unmapped, deduplicate using read1 only. Requires --paired.\n  * \"output\"\
      :  Output unmapped reads/read pairs without UMI grouping/deduplication. Only\
      \ available in umi_tools group.\n"
    info: null
    example:
    - "discard"
    required: false
    choices:
    - "discard"
    - "use"
    - "output"
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Handling of chimeric read pairs.
  - type: "string"
    name: "--chimeric_pairs"
    description: "How chimeric pairs should be handled. \nThe options are:\n  * \"\
      discard\": Discard all chimeric read pairs.\n  * \"use\":     Deduplicate using\
      \ read1 only. (default)\n  * \"output\":  Output chimeric pairs without UMI\
      \ grouping/deduplication. Only available in\n               umi_tools group.\n"
    info: null
    example:
    - "use"
    required: false
    choices:
    - "discard"
    - "use"
    - "output"
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Handling of unpaired reads; the option text now describes unpaired reads
  # instead of repeating the --unmapped_reads wording.
  - type: "string"
    name: "--unpaired_reads"
    description: "How unpaired reads should be handled. \nThe options are: \n  * \"\
      discard\": Discard all unpaired reads.\n  * \"use\":     Deduplicate using\
      \ read1 only. (default)\n  * \"output\":  Output unpaired reads without UMI\
      \ grouping/deduplication. Only available in\n               umi_tools group.\n"
    info: null
    example:
    - "use"
    required: false
    choices:
    - "discard"
    - "use"
    - "output"
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Flag: group on mapping coordinates only, ignoring the UMI.
  - type: "boolean_true"
    name: "--ignore_umi"
    description: "Ignore the UMI and group reads using mapping coordinates only."
    info: null
    direction: "input"
  # Fraction of reads to consider, chosen at random.
  - type: "double"
    name: "--subset"
    description: "Only consider a fraction of the reads, chosen at random. This is\
      \ useful for doing saturation\nanalyses.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Restrict processing to a single chromosome.
  - type: "string"
    name: "--chrom"
    description: "Only consider a single chromosome. This is useful for debugging/testing\
      \ purposes."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
# Flags affecting output sorting and how much input is buffered before
# deduplication.
- name: "Group/Dedup Options"
  arguments:
  # Flag: turn off sorting of the output (sorting uses a temporary file in
  # --temp_dir, per the description).
  - type: "boolean_true"
    name: "--no_sort_output"
    description: "By default, output is sorted. This involves the use of a temporary\
      \ unsorted file (saved in\n--temp_dir). Use this option to turn off sorting.\n"
    info: null
    direction: "input"
  # Flag: parse a whole contig before yielding reads for deduplication.
  - type: "boolean_true"
    name: "--buffer_whole_contig"
    description: "Forces dedup to parse an entire contig before yielding any reads\
      \ for deduplication. This is the\nonly way to absolutely guarantee that all\
      \ reads with the same start position are grouped together\nfor deduplication\
      \ since dedup uses the start position of the read, not the alignment coordinate\
      \ on\nwhich the reads are sorted. However, by default, dedup reads for another\
      \ 1000bp before outputting\nread groups which will avoid any reads being missed\
      \ with short read sequencing (<1000bp).\n"
    info: null
    direction: "input"
# Logging, verbosity, temp directory, compression and timing options.
# NOTE(review): --log, --error and --timeit are declared as
# `direction: "input"` files with `must_exist: true`, while their
# descriptions suggest the tool writes to them - confirm whether these should
# be outputs.
- name: "Common Options"
  arguments:
  # File for logging information.
  - type: "file"
    name: "--log"
    alternatives:
    - "-L"
    description: "File with logging information."
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Flag: send logging to stderr.
  - type: "boolean_true"
    name: "--log2stderr"
    description: "Send logging information to stderr."
    info: null
    direction: "input"
  # Integer log level; higher means more output.
  - type: "integer"
    name: "--verbose"
    alternatives:
    - "-v"
    description: "Log level. The higher, the more output. Default: `0`.\n"
    info: null
    example:
    - 0
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # File for error information.
  - type: "file"
    name: "--error"
    alternatives:
    - "-E"
    description: "File with error information."
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Directory for temporary files, passed as a plain string.
  - type: "string"
    name: "--temp_dir"
    description: "Directory for temporary files. If not set, the bash environmental\
      \ variable TMPDIR is used.\n"
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Gzip compression level.
  - type: "integer"
    name: "--compresslevel"
    description: "Level of Gzip compression to use. Default=6 matches GNU gzip rather\
      \ than python gzip default.\nDefault: `6`.\n"
    info: null
    example:
    - 6
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # File for timing information.
  - type: "file"
    name: "--timeit"
    description: "Store timing information in file."
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Name used for this class of jobs in the timing file.
  - type: "string"
    name: "--timeit_name"
    description: "Name in timing file for this class of jobs. Default: `all`.\n"
    info: null
    example:
    - "all"
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # Header for the timing information.
  - type: "string"
    name: "--timeit_header"
    description: "Add header for timing information."
    info: null
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
# Executable bash script bundled as the component's resource.
resources:
- type: "bash_script"
  path: "script.sh"
  is_executable: true
description: "Deduplicate reads based on the mapping co-ordinate and the UMI attached\
  \ to the read.\n"
# Test script plus a `test_data` file resource.
test_resources:
- type: "bash_script"
  path: "test.sh"
  is_executable: true
- type: "file"
  path: "test_data"
info: null
status: "enabled"
# The `ps` command requirement is set by the first entry of
# package_config.config_mods.
requirements:
  commands:
  - "ps"
keywords:
- "umi_tools"
- "deduplication"
- "dedup"
license: "MIT"
# Publication DOI and upstream project links for the wrapped tool.
references:
  doi:
  - "10.1101/gr.209601.116"
links:
  repository: "https://github.com/CGATOxford/UMI-tools"
  homepage: "https://umi-tools.readthedocs.io/en/latest/"
  documentation: "https://umi-tools.readthedocs.io/en/latest/reference/dedup.html"
# Two runners are declared: a standalone executable and a Nextflow module.
runners:
- type: "executable"
  id: "executable"
  docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
  id: "nextflow"
  directives:
    tag: "$id"
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    # Label name -> directive string. The mem*gb/tb labels use decimal byte
    # counts, the mem*gib/tib labels use binary (power-of-two) byte counts,
    # and the cpu* labels set a cpu count.
    labels:
      mem1gb: "memory = 1000000000.B"
      mem2gb: "memory = 2000000000.B"
      mem5gb: "memory = 5000000000.B"
      mem10gb: "memory = 10000000000.B"
      mem20gb: "memory = 20000000000.B"
      mem50gb: "memory = 50000000000.B"
      mem100gb: "memory = 100000000000.B"
      mem200gb: "memory = 200000000000.B"
      mem500gb: "memory = 500000000000.B"
      mem1tb: "memory = 1000000000000.B"
      mem2tb: "memory = 2000000000000.B"
      mem5tb: "memory = 5000000000000.B"
      mem10tb: "memory = 10000000000000.B"
      mem20tb: "memory = 20000000000000.B"
      mem50tb: "memory = 50000000000000.B"
      mem100tb: "memory = 100000000000000.B"
      mem200tb: "memory = 200000000000000.B"
      mem500tb: "memory = 500000000000000.B"
      mem1gib: "memory = 1073741824.B"
      mem2gib: "memory = 2147483648.B"
      mem4gib: "memory = 4294967296.B"
      mem8gib: "memory = 8589934592.B"
      mem16gib: "memory = 17179869184.B"
      mem32gib: "memory = 34359738368.B"
      mem64gib: "memory = 68719476736.B"
      mem128gib: "memory = 137438953472.B"
      mem256gib: "memory = 274877906944.B"
      mem512gib: "memory = 549755813888.B"
      mem1tib: "memory = 1099511627776.B"
      mem2tib: "memory = 2199023255552.B"
      mem4tib: "memory = 4398046511104.B"
      mem8tib: "memory = 8796093022208.B"
      mem16tib: "memory = 17592186044416.B"
      mem32tib: "memory = 35184372088832.B"
      mem64tib: "memory = 70368744177664.B"
      mem128tib: "memory = 140737488355328.B"
      mem256tib: "memory = 281474976710656.B"
      mem512tib: "memory = 562949953421312.B"
      cpu1: "cpus = 1"
      cpu2: "cpus = 2"
      cpu5: "cpus = 5"
      cpu10: "cpus = 10"
      cpu20: "cpus = 20"
      cpu50: "cpus = 50"
      cpu100: "cpus = 100"
      cpu200: "cpus = 200"
      cpu500: "cpus = 500"
      cpu1000: "cpus = 1000"
  debug: false
  container: "docker"
# Execution engines: a docker engine and a native engine.
engines:
# Docker engine built on the biocontainers umi_tools 1.1.5 image. The setup
# step writes the `umi_tools -v` output (with " version" stripped) to
# /var/software_versions.txt.
- type: "docker"
  id: "docker"
  image: "quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1"
  target_registry: "images.viash-hub.com"
  target_tag: "main"
  namespace_separator: "/"
  setup:
  - type: "docker"
    run:
    - "umi_tools -v | sed 's/ version//g' > /var/software_versions.txt\n"
  entrypoint: []
  cmd: null
# Native engine; matches the `.engines += { type: "native" }` entry in
# package_config.config_mods.
- type: "native"
  id: "native"
# Build provenance: source config path, runner and engine used, output
# location, viash version, and the git commit/remote.
build_info:
  config: "src/umi_tools/umi_tools_dedup/config.vsh.yaml"
  runner: "executable"
  engine: "docker|native"
  output: "target/executable/umi_tools/umi_tools_dedup"
  executable: "target/executable/umi_tools/umi_tools_dedup/umi_tools_dedup"
  viash_version: "0.9.0-RC6"
  git_commit: "766ab6c9c3059004c7c3f205621909b2d8b0b26d"
  git_remote: "https://github.com/viash-hub/biobox"
# Package-level settings for the `biobox` package.
package_config:
  name: "biobox"
  version: "main"
  description: "A collection of bioinformatics tools for working with sequence data.\n"
  info: null
  viash_version: "0.9.0-RC6"
  source: "src"
  target: "target"
  # Config modifications; their effects are visible in this file as
  # `requirements.commands`, the native engine, and the docker engine's
  # `target_registry` / `target_tag`.
  config_mods:
  - ".requirements.commands := ['ps']\n"
  - ".engines += { type: \"native\" }"
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'main'"
  keywords:
  - "bioinformatics"
  - "modules"
  - "sequencing"
  license: "MIT"
  organization: "vsh"
  links:
    repository: "https://github.com/viash-hub/biobox"
    issue_tracker: "https://github.com/viash-hub/biobox/issues"