Files
biobox/target/executable/cutadapt/.config.vsh.yaml
CI 6f2f840fd9 Build branch main with version main (7f8bcc2)
Build pipeline: viash-hub.biobox.main-zp6tq

Source commit: 7f8bcc2b3e

Source message: BD rhapsody sequence analysis (#96)

* wip

* fix test

* add help

* update 2.2 args

* fix bug

* extend test data

* output separate files

* analyse missing args

* tweaks to test

* fix script

* fix test

* fix test

* move small reference

* wip generate wta test data

* don't forget about umi in r1

* remove unneeded pkg

* load reference in memory just once

* fix random choices

* extend test

* add abc immunediscoverypanel

* wip abc testing code

* fix abc test; need unique instrument, run and flowcell ids for each sample

* add smk data

* add entry to changelog

* remove old test file

* adapt test for missing read

* update description

* add comment

* ensure cwl files are absolute

* Apply suggestions from code review

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>

* fix suggestion

* newer pipelines have docker requirements as a hint instead of a strict requirement

* rename str to content

* remove deleted resources

* fix containers

* fix script

* fix suggestion

* fix suggestion...

* fix test

* fix component name

* fix test

* apply suggestions

* fix test

* added note

* fix changelog

* fix changelog again

* splitting hairs here

---------

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
2024-09-17 10:13:18 +00:00

768 lines
24 KiB
YAML

# Identity of the viash component described by this config.
name: 'cutadapt'
version: 'main'
# Component authors with their roles, profile handles and affiliation.
# NOTE(review): nesting reconstructed from a flattened listing - confirm.
authors:
- name: 'Toni Verbeiren'
  roles:
  - 'author'
  - 'maintainer'
  info:
    links:
      github: 'tverbeiren'
      linkedin: 'verbeiren'
    organizations:
    - name: 'Data Intuitive'
      href: 'https://www.data-intuitive.com'
      role: 'Data Scientist and CEO'
# Command-line arguments of the component, organised in named groups.
argument_groups:
# Adapter sequences passed inline for the first read; every option may be
# repeated, with values joined by ';'.
- name: 'Specify Adapters for R1'
  arguments:
  # 3' adapter.
  - type: 'string'
    name: '--adapter'
    alternatives:
    - '-a'
    description: |
      Sequence of an adapter ligated to the 3' end (paired data:
      of the first read). The adapter and subsequent bases are
      trimmed. If a '$' character is appended ('anchoring'), the
      adapter is only found if it is a suffix of the read.
    info: null
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  # 5' adapter.
  - type: 'string'
    name: '--front'
    alternatives:
    - '-g'
    description: |
      Sequence of an adapter ligated to the 5' end (paired data:
      of the first read). The adapter and any preceding bases
      are trimmed. Partial matches at the 5' end are allowed. If
      a '^' character is prepended ('anchoring'), the adapter is
      only found if it is a prefix of the read.
    info: null
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  # Adapter that may sit at either end.
  - type: 'string'
    name: '--anywhere'
    alternatives:
    - '-b'
    description: |
      Sequence of an adapter that may be ligated to the 5' or 3'
      end (paired data: of the first read). Both types of
      matches as described under -a and -g are allowed. If the
      first base of the read is part of the match, the behavior
      is as with -g, otherwise as with -a. This option is mostly
      for rescuing failed library preparations - do not use if
      you know which end your adapter was ligated to!
    info: null
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
# Adapter sequences for the first read, supplied as existing FASTA files.
# NOTE(review): only --adapter_fasta accepts multiple values; confirm that
# the asymmetry with --front_fasta / --anywhere_fasta is intended.
- name: 'Specify Adapters using Fasta files for R1'
  arguments:
  - type: 'file'
    name: '--adapter_fasta'
    description: |
      Fasta file containing sequences of an adapter ligated to the 3' end (paired data:
      of the first read). The adapter and subsequent bases are
      trimmed. If a '$' character is appended ('anchoring'), the
      adapter is only found if it is a suffix of the read.
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  - type: 'file'
    name: '--front_fasta'
    description: |
      Fasta file containing sequences of an adapter ligated to the 5' end (paired data:
      of the first read). The adapter and any preceding bases
      are trimmed. Partial matches at the 5' end are allowed. If
      a '^' character is prepended ('anchoring'), the adapter is
      only found if it is a prefix of the read.
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'file'
    name: '--anywhere_fasta'
    description: |
      Fasta file containing sequences of an adapter that may be ligated to the 5' or 3'
      end (paired data: of the first read). Both types of
      matches as described under -a and -g are allowed. If the
      first base of the read is part of the match, the behavior
      is as with -g, otherwise as with -a. This option is mostly
      for rescuing failed library preparations - do not use if
      you know which end your adapter was ligated to!
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
# Adapter sequences passed inline for the second read (R2) of paired data.
# These are the R2 counterparts of -a/-g/-b, so the help text refers to the
# second read and to the upper-case short options.
- name: 'Specify Adapters for R2'
  arguments:
  # 3' adapter on R2.
  - type: 'string'
    name: '--adapter_r2'
    alternatives:
    - '-A'
    description: |
      Sequence of an adapter ligated to the 3' end (paired data:
      of the second read). The adapter and subsequent bases are
      trimmed. If a '$' character is appended ('anchoring'), the
      adapter is only found if it is a suffix of the read.
    info: null
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  # 5' adapter on R2.
  - type: 'string'
    name: '--front_r2'
    alternatives:
    - '-G'
    description: |
      Sequence of an adapter ligated to the 5' end (paired data:
      of the second read). The adapter and any preceding bases
      are trimmed. Partial matches at the 5' end are allowed. If
      a '^' character is prepended ('anchoring'), the adapter is
      only found if it is a prefix of the read.
    info: null
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  # Adapter that may sit at either end of R2.
  - type: 'string'
    name: '--anywhere_r2'
    alternatives:
    - '-B'
    description: |
      Sequence of an adapter that may be ligated to the 5' or 3'
      end (paired data: of the second read). Both types of
      matches as described under -A and -G are allowed. If the
      first base of the read is part of the match, the behavior
      is as with -G, otherwise as with -A. This option is mostly
      for rescuing failed library preparations - do not use if
      you know which end your adapter was ligated to!
    info: null
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
# Adapter sequences for the second read (R2), supplied as existing FASTA
# files. The help text refers to the second read, matching the option names.
- name: 'Specify Adapters using Fasta files for R2'
  arguments:
  - type: 'file'
    name: '--adapter_r2_fasta'
    description: |
      Fasta file containing sequences of an adapter ligated to the 3' end (paired data:
      of the second read). The adapter and subsequent bases are
      trimmed. If a '$' character is appended ('anchoring'), the
      adapter is only found if it is a suffix of the read.
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'file'
    name: '--front_r2_fasta'
    description: |
      Fasta file containing sequences of an adapter ligated to the 5' end (paired data:
      of the second read). The adapter and any preceding bases
      are trimmed. Partial matches at the 5' end are allowed. If
      a '^' character is prepended ('anchoring'), the adapter is
      only found if it is a prefix of the read.
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'file'
    name: '--anywhere_r2_fasta'
    description: |
      Fasta file containing sequences of an adapter that may be ligated to the 5' or 3'
      end (paired data: of the second read). Both types of
      matches as described under -A and -G are allowed. If the
      first base of the read is part of the match, the behavior
      is as with -G, otherwise as with -A. This option is mostly
      for rescuing failed library preparations - do not use if
      you know which end your adapter was ligated to!
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
# Switches that only matter when processing paired-end data.
- name: 'Paired-end options'
  arguments:
  - type: 'boolean_true'
    name: '--pair_adapters'
    description: |
      Treat adapters given with -a/-A etc. as pairs. Either both
      or none are removed from each read pair.
    info: null
    direction: 'input'
  # Restricted to the three listed choices.
  - type: 'string'
    name: '--pair_filter'
    description: |
      Which of the reads in a paired-end read have to match the
      filtering criterion in order for the pair to be filtered.
    info: null
    required: false
    choices:
    - 'any'
    - 'both'
    - 'first'
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--interleaved'
    description: |
      Read and/or write interleaved paired-end reads.
    info: null
    direction: 'input'
# Input files plus the settings that control adapter matching.
- name: 'Input parameters'
  arguments:
  # The only required input of the component.
  - type: 'file'
    name: '--input'
    description: |
      Input fastq file for single-end reads or R1 for paired-end reads.
    info: null
    must_exist: true
    create_parent: true
    required: true
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'file'
    name: '--input_r2'
    description: |
      Input fastq file for R2 in the case of paired-end reads.
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'double'
    name: '--error_rate'
    alternatives:
    - '-E'
    - '--errors'
    description: |
      Maximum allowed error rate (if 0 <= E < 1), or absolute
      number of errors for full-length adapter match (if E is an
      integer >= 1). Error rate = no. of errors divided by
      length of matching region. Default: 0.1 (10%).
    info: null
    example:
    - 0.1
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  # NOTE(review): declared boolean_false although the flag name is already
  # negative - confirm script.sh interprets the resulting value as intended.
  - type: 'boolean_false'
    name: '--no_indels'
    description: |
      Allow only mismatches in alignments.
    info: null
    direction: 'input'
  - type: 'integer'
    name: '--times'
    alternatives:
    - '-n'
    description: |
      Remove up to COUNT adapters from each read. Default: 1.
    info: null
    example:
    - 1
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'integer'
    name: '--overlap'
    alternatives:
    - '-O'
    description: |
      Require MINLENGTH overlap between read and adapter for an
      adapter to be found. The default is 3.
    info: null
    example:
    - 3
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--match_read_wildcards'
    description: |
      Interpret IUPAC wildcards in reads.
    info: null
    direction: 'input'
  # NOTE(review): same boolean_false / negative-name combination as
  # --no_indels - confirm against script.sh.
  - type: 'boolean_false'
    name: '--no_match_adapter_wildcards'
    description: |
      Do not interpret IUPAC wildcards in adapters.
    info: null
    direction: 'input'
  # Restricted to the five listed choices.
  - type: 'string'
    name: '--action'
    description: |
      What to do if a match was found. trim: trim adapter and
      up- or downstream sequence; retain: trim, but retain
      adapter; mask: replace with 'N' characters; lowercase:
      convert to lowercase; none: leave unchanged.
      The default is trim.
    info: null
    example:
    - 'trim'
    required: false
    choices:
    - 'trim'
    - 'retain'
    - 'mask'
    - 'lowercase'
    - 'none'
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--revcomp'
    alternatives:
    - '--rc'
    description: |
      Check both the read and its reverse complement for adapter
      matches. If match is on reverse-complemented version,
      output that one.
    info: null
    direction: 'input'
# Selects whether and how reads are demultiplexed.
- name: 'Demultiplexing options'
  arguments:
  # The description stays a double-quoted scalar on purpose: its value ends
  # with a space before the final newline, which a block scalar would make
  # invisible and easy to strip by accident.
  - type: 'string'
    name: '--demultiplex_mode'
    description: "Enable demultiplexing and set the mode for it.\nWith mode 'unique_dual',\
      \ adapters from the first and second read are used,\nand the indexes from the\
      \ reads are only used in pairs. This implies\n--pair_adapters.\nEnabling mode\
      \ 'combinatorial_dual' allows all combinations of the sets of indexes\non R1\
      \ and R2. It is necessary to write each read pair to an output\nfile depending\
      \ on the adapters found on both R1 and R2.\nMode 'single', uses indexes or barcodes\
      \ located at the 5'\nend of the R1 read (single). \n"
    info: null
    required: false
    choices:
    - 'single'
    - 'unique_dual'
    - 'combinatorial_dual'
    direction: 'input'
    multiple: false
    multiple_sep: ';'
# Options that alter the reads themselves: cutting, quality trimming,
# shortening and renaming.
- name: 'Read modifications'
  arguments:
  # Accepts several values (';'-separated), per "can be used twice" below.
  - type: 'integer'
    name: '--cut'
    alternatives:
    - '-u'
    description: |
      Remove LEN bases from each read (or R1 if paired; use --cut_r2
      option for R2). If LEN is positive, remove bases from the
      beginning. If LEN is negative, remove bases from the end.
      Can be used twice if LENs have different signs. Applied
      *before* adapter trimming.
    info: null
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  - type: 'integer'
    name: '--cut_r2'
    description: |
      Remove LEN bases from each read (for R2). If LEN is positive, remove bases from the
      beginning. If LEN is negative, remove bases from the end.
      Can be used twice if LENs have different signs. Applied
      *before* adapter trimming.
    info: null
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  - type: 'string'
    name: '--nextseq_trim'
    description: |
      NextSeq-specific quality trimming (each read). Trims also
      dark cycles appearing as high-quality G bases.
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  # Typed as string because one or two comma-separated cutoffs are accepted.
  - type: 'string'
    name: '--quality_cutoff'
    alternatives:
    - '-q'
    description: |
      Trim low-quality bases from 5' and/or 3' ends of each read
      before adapter removal. Applied to both reads if data is
      paired. If one value is given, only the 3' end is trimmed.
      If two comma-separated cutoffs are given, the 5' end is
      trimmed with the first cutoff, the 3' end with the second.
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'string'
    name: '--quality_cutoff_r2'
    alternatives:
    - '-Q'
    description: |
      Quality-trimming cutoff for R2. Default: same as for R1
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'integer'
    name: '--quality_base'
    description: |
      Assume that quality values in FASTQ are encoded as
      ascii(quality + N). This needs to be set to 64 for some
      old Illumina FASTQ files. The default is 33.
    info: null
    example:
    - 33
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--poly_a'
    description: 'Trim poly-A tails'
    info: null
    direction: 'input'
  - type: 'integer'
    name: '--length'
    alternatives:
    - '-l'
    description: |
      Shorten reads to LENGTH. Positive values remove bases at
      the end while negative ones remove bases at the beginning.
      This and the following modifications are applied after
      adapter trimming.
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--trim_n'
    description: "Trim N's on ends of reads."
    info: null
    direction: 'input'
  - type: 'string'
    name: '--length_tag'
    description: |
      Search for TAG followed by a decimal number in the
      description field of the read. Replace the decimal number
      with the correct length of the trimmed read. For example,
      use --length-tag 'length=' to correct fields like
      'length=123'.
    info: null
    example:
    - 'length='
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  # NOTE(review): the description says the option can be given multiple
  # times, yet `multiple` is false - confirm which one is intended.
  - type: 'string'
    name: '--strip_suffix'
    description: |
      Remove this suffix from read names if present. Can be
      given multiple times.
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'string'
    name: '--prefix'
    alternatives:
    - '-x'
    description: |
      Add this prefix to read names. Use {name} to insert the
      name of the matching adapter.
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'string'
    name: '--suffix'
    alternatives:
    - '-y'
    description: |
      Add this suffix to read names; can also include {name}
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'string'
    name: '--rename'
    description: |
      Rename reads using TEMPLATE containing variables such as
      {id}, {adapter_name} etc. (see documentation)
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--zero_cap'
    alternatives:
    - '-z'
    description: 'Change negative quality values to zero.'
    info: null
    direction: 'input'
# Filters that decide which processed reads are kept or discarded.
- name: 'Filtering of processed reads'
  description: |
    Filters are applied after above read modifications. Paired-end reads are
    always discarded pairwise (see also --pair_filter).
  arguments:
  # Length limits are strings because the 'R1:R2' colon syntax is accepted.
  - type: 'string'
    name: '--minimum_length'
    alternatives:
    - '-m'
    description: |
      Discard reads shorter than LEN. Default is 0.
      When trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:).
      If the colon syntax is not used, the same minimum length applies to both reads, as discussed above.
      Also, one of the values can be omitted to impose no restrictions.
      For example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored.
    info: null
    example:
    - '0'
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'string'
    name: '--maximum_length'
    alternatives:
    - '-M'
    description: |
      Discard reads longer than LEN. Default: no limit.
      For paired reads, see the remark for --minimum_length
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  # String because the value is either a count or a fraction between 0 and 1.
  - type: 'string'
    name: '--max_n'
    description: |
      Discard reads with more than COUNT 'N' bases. If COUNT is
      a number between 0 and 1, it is interpreted as a fraction
      of the read length.
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  # Expected errors are computed from quality values and are fractional, so
  # the threshold is a double rather than an integer type.
  - type: 'double'
    name: '--max_expected_errors'
    alternatives:
    - '--max_ee'
    description: |
      Discard reads whose expected number of errors (computed
      from quality values) exceeds ERRORS.
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  # An error count divided by read length is a fraction; an integer type
  # would make any useful threshold impossible to pass.
  - type: 'double'
    name: '--max_average_error_rate'
    alternatives:
    - '--max_aer'
    description: |
      as --max_expected_errors (see above), but divided by
      length to account for reads of varying length.
    info: null
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--discard_trimmed'
    alternatives:
    - '--discard'
    description: |
      Discard reads that contain an adapter. Use also -O to
      avoid discarding too many randomly matching reads.
    info: null
    direction: 'input'
  - type: 'boolean_true'
    name: '--discard_untrimmed'
    alternatives:
    - '--trimmed_only'
    description: |
      Discard reads that do not contain an adapter.
    info: null
    direction: 'input'
  - type: 'boolean_true'
    name: '--discard_casava'
    description: |
      Discard reads that did not pass CASAVA filtering (header
      has :Y:).
    info: null
    direction: 'input'
# Reporting switches and the location of the produced read files.
- name: 'Output parameters'
  arguments:
  - type: 'string'
    name: '--report'
    description: |
      Which type of report to print: 'full' (default) or 'minimal'.
    info: null
    example:
    - 'full'
    required: false
    choices:
    - 'full'
    - 'minimal'
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  # NOTE(review): this is a flag, yet the description speaks of "this file" -
  # confirm where the JSON report is actually written.
  - type: 'boolean_true'
    name: '--json'
    description: |
      Write report in JSON format to this file.
    info: null
    direction: 'input'
  # The only output-direction argument; required, and may match several files.
  - type: 'file'
    name: '--output'
    description: |
      Glob pattern for matching the expected output files.
      Should include `$output_dir`.
    info: null
    example:
    - 'fastq/*_001.fast[a,q]'
    must_exist: true
    create_parent: true
    required: true
    direction: 'output'
    multiple: true
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--fasta'
    description: |
      Output FASTA to standard output even on FASTQ input.
    info: null
    direction: 'input'
  - type: 'boolean_true'
    name: '--info_file'
    description: |
      Write information about each read and its adapter matches
      into info.txt in the output directory.
      See the documentation for the file format.
    info: null
    direction: 'input'
# Single switch that turns on debug output.
- name: 'Debug'
  arguments:
  - type: 'boolean_true'
    name: '--debug'
    description: 'Print debug information'
    info: null
    direction: 'input'
# Script that implements the component.
resources:
- type: 'bash_script'
  path: 'script.sh'
  is_executable: true
description: |
  Cutadapt removes adapter sequences from high-throughput sequencing reads.
# Script used to test the component.
test_resources:
- type: 'bash_script'
  path: 'test.sh'
  is_executable: true
info: null
status: 'enabled'
# Commands that must be available; set through a config mod listed in
# package_config.config_mods.
requirements:
  commands:
  - 'ps'
keywords:
- 'RNA-seq'
- 'scRNA-seq'
- 'high-throughput'
license: 'MIT'
# Citation and upstream project locations for the wrapped tool.
references:
  doi:
  - '10.14806/ej.17.1.200'
links:
  repository: 'https://github.com/marcelm/cutadapt'
  homepage: 'https://cutadapt.readthedocs.io'
  documentation: 'https://cutadapt.readthedocs.io'
# Two runners are declared: a standalone executable and a Nextflow module.
# NOTE(review): nesting reconstructed from a flattened listing - confirm.
runners:
- type: 'executable'
  id: 'executable'
  docker_setup_strategy: 'ifneedbepullelsecachedbuild'
- type: 'nextflow'
  id: 'nextflow'
  directives:
    tag: '$id'
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    # Each label name maps to the directive text it stands for.
    labels:
      # Memory, decimal units (powers of 10).
      mem1gb: 'memory = 1000000000.B'
      mem2gb: 'memory = 2000000000.B'
      mem5gb: 'memory = 5000000000.B'
      mem10gb: 'memory = 10000000000.B'
      mem20gb: 'memory = 20000000000.B'
      mem50gb: 'memory = 50000000000.B'
      mem100gb: 'memory = 100000000000.B'
      mem200gb: 'memory = 200000000000.B'
      mem500gb: 'memory = 500000000000.B'
      mem1tb: 'memory = 1000000000000.B'
      mem2tb: 'memory = 2000000000000.B'
      mem5tb: 'memory = 5000000000000.B'
      mem10tb: 'memory = 10000000000000.B'
      mem20tb: 'memory = 20000000000000.B'
      mem50tb: 'memory = 50000000000000.B'
      mem100tb: 'memory = 100000000000000.B'
      mem200tb: 'memory = 200000000000000.B'
      mem500tb: 'memory = 500000000000000.B'
      # Memory, binary units (powers of 2).
      mem1gib: 'memory = 1073741824.B'
      mem2gib: 'memory = 2147483648.B'
      mem4gib: 'memory = 4294967296.B'
      mem8gib: 'memory = 8589934592.B'
      mem16gib: 'memory = 17179869184.B'
      mem32gib: 'memory = 34359738368.B'
      mem64gib: 'memory = 68719476736.B'
      mem128gib: 'memory = 137438953472.B'
      mem256gib: 'memory = 274877906944.B'
      mem512gib: 'memory = 549755813888.B'
      mem1tib: 'memory = 1099511627776.B'
      mem2tib: 'memory = 2199023255552.B'
      mem4tib: 'memory = 4398046511104.B'
      mem8tib: 'memory = 8796093022208.B'
      mem16tib: 'memory = 17592186044416.B'
      mem32tib: 'memory = 35184372088832.B'
      mem64tib: 'memory = 70368744177664.B'
      mem128tib: 'memory = 140737488355328.B'
      mem256tib: 'memory = 281474976710656.B'
      mem512tib: 'memory = 562949953421312.B'
      # CPU counts.
      cpu1: 'cpus = 1'
      cpu2: 'cpus = 2'
      cpu5: 'cpus = 5'
      cpu10: 'cpus = 10'
      cpu20: 'cpus = 20'
      cpu50: 'cpus = 50'
      cpu100: 'cpus = 100'
      cpu200: 'cpus = 200'
      cpu500: 'cpus = 500'
      cpu1000: 'cpus = 1000'
  debug: false
  container: 'docker'
# Execution environments: a Docker image built on python:3.12, and the
# native host.
engines:
- type: 'docker'
  id: 'docker'
  image: 'python:3.12'
  target_registry: 'images.viash-hub.com'
  target_tag: 'main'
  namespace_separator: '/'
  setup:
  # Install cutadapt with pip; no version is pinned.
  - type: 'python'
    user: false
    pip:
    - 'cutadapt'
    upgrade: true
  # Record the installed cutadapt version in /var/software_versions.txt.
  - type: 'docker'
    run:
    - |
      cutadapt --version | sed 's/\(.*\)/cutadapt: "\1"/' > /var/software_versions.txt
  entrypoint: []
  cmd: null
- type: 'native'
  id: 'native'
# Provenance of this build: source config, runner/engine used, output
# locations and the git state it was built from.
build_info:
  config: 'src/cutadapt/config.vsh.yaml'
  runner: 'executable'
  engine: 'docker|native'
  output: 'target/executable/cutadapt'
  executable: 'target/executable/cutadapt/cutadapt'
  viash_version: '0.9.0'
  git_commit: '7f8bcc2b3e1ffaac9778b6acb42420b19660d1a1'
  # SECURITY: the remote is recorded without credentials. An access token
  # was previously embedded in this URL's userinfo part; it must be treated
  # as leaked and revoked, and the build should strip credentials from the
  # remote before writing this field.
  git_remote: 'https://github.com/viash-hub/biobox'
  git_tag: 'v0.2.0-3-g7f8bcc2'
# Settings of the package this component belongs to.
package_config:
  name: 'biobox'
  version: 'main'
  description: |
    A collection of bioinformatics tools for working with sequence data.
  info: null
  viash_version: '0.9.0'
  source: 'src'
  target: 'target'
  # Config modifications recorded for this build. Entries that themselves
  # contain single quotes or an escaped newline stay double-quoted.
  config_mods:
  - ".requirements.commands := ['ps']\n"
  - '.engines += { type: "native" }'
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'main'"
  keywords:
  - 'bioinformatics'
  - 'modules'
  - 'sequencing'
  license: 'MIT'
  organization: 'vsh'
  links:
    repository: 'https://github.com/viash-hub/biobox'
    issue_tracker: 'https://github.com/viash-hub/biobox/issues'