Files
biobox/src/cutadapt/config.vsh.yaml
CI 5fc8f64fcb Build branch update_busco with version update_busco (1ca6dec)
Build pipeline: viash-hub.biobox.update-busco-bsvnf

Source commit: 1ca6dec48e

Source message: Typo
2024-07-01 19:57:00 +00:00

482 lines
18 KiB
YAML

# Component identity and pointers to the upstream cutadapt project.
name: cutadapt
description: |
  Cutadapt removes adapter sequences from high-throughput sequencing reads.
keywords:
- RNA-seq
- scRNA-seq
- high-throughput
links:
  homepage: https://cutadapt.readthedocs.io
  documentation: https://cutadapt.readthedocs.io
  repository: https://github.com/marcelm/cutadapt
references:
  # Quoted so the DOI can never be mistaken for a number.
  doi: "10.14806/ej.17.1.200"
license: MIT
argument_groups:
####################################################################
# Adapter sequences given directly on the command line for the first read.
# All three options are optional strings and declared with `multiple: true`.
- name: Specify Adapters for R1
  arguments:
  # 3' adapter.
  - name: --adapter
    alternatives: [-a]
    type: string
    required: false
    multiple: true
    description: |
      Sequence of an adapter ligated to the 3' end (paired data:
      of the first read). The adapter and subsequent bases are
      trimmed. If a '$' character is appended ('anchoring'), the
      adapter is only found if it is a suffix of the read.
  # 5' adapter.
  - name: --front
    alternatives: [-g]
    type: string
    required: false
    multiple: true
    description: |
      Sequence of an adapter ligated to the 5' end (paired data:
      of the first read). The adapter and any preceding bases
      are trimmed. Partial matches at the 5' end are allowed. If
      a '^' character is prepended ('anchoring'), the adapter is
      only found if it is a prefix of the read.
  # Adapter that may sit on either end.
  - name: --anywhere
    alternatives: [-b]
    type: string
    required: false
    multiple: true
    description: |
      Sequence of an adapter that may be ligated to the 5' or 3'
      end (paired data: of the first read). Both types of
      matches as described under -a and -g are allowed. If the
      first base of the read is part of the match, the behavior
      is as with -g, otherwise as with -a. This option is mostly
      for rescuing failed library preparations - do not use if
      you know which end your adapter was ligated to!
####################################################################
# File-based counterparts of the R1 adapter options: each takes a Fasta file
# of adapter sequences instead of a literal sequence.
- name: Specify Adapters using Fasta files for R1
  arguments:
  - name: --adapter_fasta
    type: file
    required: false
    multiple: true
    description: |
      Fasta file containing sequences of an adapter ligated to the 3' end (paired data:
      of the first read). The adapter and subsequent bases are
      trimmed. If a '$' character is appended ('anchoring'), the
      adapter is only found if it is a suffix of the read.
  # NOTE(review): unlike --adapter_fasta, the two options below do not set
  # `multiple: true` - confirm whether that difference is intended.
  - name: --front_fasta
    type: file
    required: false
    description: |
      Fasta file containing sequences of an adapter ligated to the 5' end (paired data:
      of the first read). The adapter and any preceding bases
      are trimmed. Partial matches at the 5' end are allowed. If
      a '^' character is prepended ('anchoring'), the adapter is
      only found if it is a prefix of the read.
  - name: --anywhere_fasta
    type: file
    required: false
    description: |
      Fasta file containing sequences of an adapter that may be ligated to the 5' or 3'
      end (paired data: of the first read). Both types of
      matches as described under -a and -g are allowed. If the
      first base of the read is part of the match, the behavior
      is as with -g, otherwise as with -a. This option is mostly
      for rescuing failed library preparations - do not use if
      you know which end your adapter was ligated to!
####################################################################
# Adapter sequences given directly on the command line for the second read
# (R2) of paired-end data. Mirrors the R1 group, using the upper-case short
# flags -A / -G / -B.
- name: Specify Adapters for R2
  arguments:
  # 3' adapter on R2.
  - name: --adapter_r2
    alternatives: [-A]
    type: string
    required: false
    multiple: true
    description: |
      Sequence of an adapter ligated to the 3' end of the second
      read (R2) in paired data. The adapter and subsequent bases are
      trimmed. If a '$' character is appended ('anchoring'), the
      adapter is only found if it is a suffix of the read.
  # 5' adapter on R2.
  - name: --front_r2
    alternatives: [-G]
    type: string
    required: false
    multiple: true
    description: |
      Sequence of an adapter ligated to the 5' end of the second
      read (R2) in paired data. The adapter and any preceding bases
      are trimmed. Partial matches at the 5' end are allowed. If
      a '^' character is prepended ('anchoring'), the adapter is
      only found if it is a prefix of the read.
  # Adapter that may sit on either end of R2.
  - name: --anywhere_r2
    alternatives: [-B]
    type: string
    required: false
    multiple: true
    description: |
      Sequence of an adapter that may be ligated to the 5' or 3'
      end of the second read (R2) in paired data. Both types of
      matches as described under -A and -G are allowed. If the
      first base of the read is part of the match, the behavior
      is as with -G, otherwise as with -A. This option is mostly
      for rescuing failed library preparations - do not use if
      you know which end your adapter was ligated to!
####################################################################
# File-based counterparts of the R2 adapter options: each takes a Fasta file
# of adapter sequences that apply to the second read (R2) of paired-end data.
- name: Specify Adapters using Fasta files for R2
  arguments:
  - name: --adapter_r2_fasta
    type: file
    required: false
    description: |
      Fasta file containing sequences of an adapter ligated to the 3' end of the
      second read (R2) in paired data. The adapter and subsequent bases are
      trimmed. If a '$' character is appended ('anchoring'), the
      adapter is only found if it is a suffix of the read.
  - name: --front_r2_fasta
    type: file
    required: false
    description: |
      Fasta file containing sequences of an adapter ligated to the 5' end of the
      second read (R2) in paired data. The adapter and any preceding bases
      are trimmed. Partial matches at the 5' end are allowed. If
      a '^' character is prepended ('anchoring'), the adapter is
      only found if it is a prefix of the read.
  - name: --anywhere_r2_fasta
    type: file
    required: false
    description: |
      Fasta file containing sequences of an adapter that may be ligated to the 5' or 3'
      end of the second read (R2) in paired data. Both types of
      matches as described under -A and -G are allowed. If the
      first base of the read is part of the match, the behavior
      is as with -G, otherwise as with -A. This option is mostly
      for rescuing failed library preparations - do not use if
      you know which end your adapter was ligated to!
####################################################################
# Switches that only matter when processing read pairs.
- name: Paired-end options
  arguments:
  # Flag: couple the R1/R2 adapters.
  - name: --pair_adapters
    type: boolean_true
    description: |
      Treat adapters given with -a/-A etc. as pairs. Either both
      or none are removed from each read pair.
  # Restricted to one of the listed choices.
  - name: --pair_filter
    type: string
    choices:
    - any
    - both
    - first
    description: |
      Which of the reads in a paired-end read have to match the
      filtering criterion in order for the pair to be filtered.
  # Flag: interleaved input and/or output.
  - name: --interleaved
    type: boolean_true
    description: |
      Read and/or write interleaved paired-end reads.
####################################################################
# Input reads plus the options that control how adapters are matched.
- name: Input parameters
  arguments:
  # The only required input: single-end reads, or R1 of a pair.
  - name: --input
    type: file
    required: true
    description: |
      Input fastq file for single-end reads or R1 for paired-end reads.
  # Optional mate file.
  - name: --input_r2
    type: file
    required: false
    description: |
      Input fastq file for R2 in the case of paired-end reads.
  - name: --error_rate
    alternatives: [-E, --errors]
    type: double
    example: 0.1
    description: |
      Maximum allowed error rate (if 0 <= E < 1), or absolute
      number of errors for full-length adapter match (if E is an
      integer >= 1). Error rate = no. of errors divided by
      length of matching region. Default: 0.1 (10%).
  # Declared as boolean_false (as is --no_match_adapter_wildcards below).
  - name: --no_indels
    type: boolean_false
    description: |
      Allow only mismatches in alignments.
  - name: --times
    alternatives: [-n]
    type: integer
    example: 1
    description: |
      Remove up to COUNT adapters from each read. Default: 1.
  - name: --overlap
    alternatives: [-O]
    type: integer
    example: 3
    description: |
      Require MINLENGTH overlap between read and adapter for an
      adapter to be found. The default is 3.
  - name: --match_read_wildcards
    type: boolean_true
    description: |
      Interpret IUPAC wildcards in reads.
  - name: --no_match_adapter_wildcards
    type: boolean_false
    description: |
      Do not interpret IUPAC wildcards in adapters.
  # Restricted to one of the listed choices.
  - name: --action
    type: string
    choices: [trim, retain, mask, lowercase, none]
    example: trim
    description: |
      What to do if a match was found. trim: trim adapter and
      up- or downstream sequence; retain: trim, but retain
      adapter; mask: replace with 'N' characters; lowercase:
      convert to lowercase; none: leave unchanged.
      The default is trim.
  - name: --revcomp
    alternatives: [--rc]
    type: boolean_true
    description: |
      Check both the read and its reverse complement for adapter
      matches. If match is on reverse-complemented version,
      output that one.
####################################################################
# A single optional selector that switches demultiplexing on and picks its mode.
- name: Demultiplexing options
  arguments:
  - name: --demultiplex_mode
    type: string
    required: false
    choices:
    - single
    - unique_dual
    - combinatorial_dual
    description: |
      Enable demultiplexing and set the mode for it.
      With mode 'unique_dual', adapters from the first and second read are used,
      and the indexes from the reads are only used in pairs. This implies
      --pair_adapters.
      Enabling mode 'combinatorial_dual' allows all combinations of the sets of indexes
      on R1 and R2. It is necessary to write each read pair to an output
      file depending on the adapters found on both R1 and R2.
      Mode 'single', uses indexes or barcodes located at the 5'
      end of the R1 read (single).
####################################################################
# Options that alter the reads themselves: fixed-length cuts, quality
# trimming, length adjustments, and read-name rewriting.
- name: Read modifications
  arguments:
  - name: --cut
    alternatives: [-u]
    type: integer
    multiple: true
    description: |
      Remove LEN bases from each read (or R1 if paired; use --cut_r2
      option for R2). If LEN is positive, remove bases from the
      beginning. If LEN is negative, remove bases from the end.
      Can be used twice if LENs have different signs. Applied
      *before* adapter trimming.
  - name: --cut_r2
    type: integer
    multiple: true
    description: |
      Remove LEN bases from each read (for R2). If LEN is positive, remove bases from the
      beginning. If LEN is negative, remove bases from the end.
      Can be used twice if LENs have different signs. Applied
      *before* adapter trimming.
  - name: --nextseq_trim
    type: string
    description: |
      NextSeq-specific quality trimming (each read). Trims also
      dark cycles appearing as high-quality G bases.
  # A string because the value may be one cutoff or two comma-separated ones
  # (see the description).
  - name: --quality_cutoff
    alternatives: [-q]
    type: string
    description: |
      Trim low-quality bases from 5' and/or 3' ends of each read
      before adapter removal. Applied to both reads if data is
      paired. If one value is given, only the 3' end is trimmed.
      If two comma-separated cutoffs are given, the 5' end is
      trimmed with the first cutoff, the 3' end with the second.
  - name: --quality_cutoff_r2
    alternatives: [-Q]
    type: string
    description: |
      Quality-trimming cutoff for R2. Default: same as for R1
  - name: --quality_base
    type: integer
    example: 33
    description: |
      Assume that quality values in FASTQ are encoded as
      ascii(quality + N). This needs to be set to 64 for some
      old Illumina FASTQ files. The default is 33.
  - name: --poly_a
    type: boolean_true
    description: Trim poly-A tails
  - name: --length
    alternatives: [-l]
    type: integer
    description: |
      Shorten reads to LENGTH. Positive values remove bases at
      the end while negative ones remove bases at the beginning.
      This and the following modifications are applied after
      adapter trimming.
  - name: --trim_n
    type: boolean_true
    description: Trim N's on ends of reads.
  # The description uses this component's own spelling (--length_tag); the
  # hyphenated form is not an argument defined in this config.
  - name: --length_tag
    type: string
    example: "length="
    description: |
      Search for TAG followed by a decimal number in the
      description field of the read. Replace the decimal number
      with the correct length of the trimmed read. For example,
      use --length_tag 'length=' to correct fields like
      'length=123'.
  # NOTE(review): the description says this can be given multiple times, but
  # `multiple: true` is not set - confirm which one is intended.
  - name: --strip_suffix
    type: string
    description: |
      Remove this suffix from read names if present. Can be
      given multiple times.
  - name: --prefix
    alternatives: [-x]
    type: string
    description: |
      Add this prefix to read names. Use {name} to insert the
      name of the matching adapter.
  - name: --suffix
    alternatives: [-y]
    type: string
    description: |
      Add this suffix to read names; can also include {name}
  - name: --rename
    type: string
    description: |
      Rename reads using TEMPLATE containing variables such as
      {id}, {adapter_name} etc. (see documentation)
  - name: --zero_cap
    alternatives: [-z]
    type: boolean_true
    description: Change negative quality values to zero.
####################################################################
# Read filters; per the group description they run after the read
# modifications and always drop paired-end reads pairwise.
- name: Filtering of processed reads
  description: |
    Filters are applied after above read modifications. Paired-end reads are
    always discarded pairwise (see also --pair_filter).
  arguments:
  # A string rather than an integer so the `LEN1:LEN2` colon syntax described
  # below can be passed through.
  - name: --minimum_length
    alternatives: [-m]
    type: string
    example: "0"
    description: |
      Discard reads shorter than LEN. Default is 0.
      When trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:).
      If the colon syntax is not used, the same minimum length applies to both reads, as discussed above.
      Also, one of the values can be omitted to impose no restrictions.
      For example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored.
  - name: --maximum_length
    alternatives: [-M]
    type: string
    description: |
      Discard reads longer than LEN. Default: no limit.
      For paired reads, see the remark for --minimum_length
  # A string: the value is either a count or a fraction between 0 and 1.
  - name: --max_n
    type: string
    description: |
      Discard reads with more than COUNT 'N' bases. If COUNT is
      a number between 0 and 1, it is interpreted as a fraction
      of the read length.
  # double, not long: upstream cutadapt takes a floating-point value here, so
  # thresholds such as 0.5 must be accepted.
  - name: --max_expected_errors
    alternatives: [--max_ee]
    type: double
    description: |
      Discard reads whose expected number of errors (computed
      from quality values) exceeds ERRORS.
  # double, not long: this is an error *rate* (expected errors divided by read
  # length), so useful values are fractional.
  - name: --max_average_error_rate
    alternatives: [--max_aer]
    type: double
    description: |
      as --max_expected_errors (see above), but divided by
      length to account for reads of varying length.
  - name: --discard_trimmed
    alternatives: [--discard]
    type: boolean_true
    description: |
      Discard reads that contain an adapter. Use also -O to
      avoid discarding too many randomly matching reads.
  - name: --discard_untrimmed
    alternatives: [--trimmed_only]
    type: boolean_true
    description: |
      Discard reads that do not contain an adapter.
  - name: --discard_casava
    type: boolean_true
    description: |
      Discard reads that did not pass CASAVA filtering (header
      has :Y:).
####################################################################
# Report settings and the declaration of the files the component produces.
- name: Output parameters
  arguments:
  - name: --report
    type: string
    choices: [full, minimal]
    example: full
    description: |
      Which type of report to print: 'full' (default) or 'minimal'.
  # This is a flag (boolean_true), not a file argument, so the description
  # must not promise a user-chosen path.
  # NOTE(review): the name/location of the JSON report is decided by the
  # script, which is not visible here - confirm and document it.
  - name: --json
    type: boolean_true
    description: |
      Write the report in JSON format.
  # Required output declaration: a glob that may match several files
  # (multiple: true), all of which must exist (must_exist: true).
  - name: --output
    type: file
    direction: output
    required: true
    must_exist: true
    multiple: true
    example: "fastq/*_001.fast[a,q]"
    description: |
      Glob pattern for matching the expected output files.
      Should include `$output_dir`.
  - name: --fasta
    type: boolean_true
    description: |
      Output FASTA to standard output even on FASTQ input.
  - name: --info_file
    type: boolean_true
    description: |
      Write information about each read and its adapter matches
      into info.txt in the output directory.
      See the documentation for the file format.
  # Commented-out argument stubs; none of these are currently defined.
  # - name: -Z
  # - name: --rest_file
  # - name: --wildcard-file
  # - name: --too_short_output
  # - name: --too_long_output
  # - name: --untrimmed_output
  # - name: --untrimmed_paired_output
  # - name: too_short_paired_output
  # - name: too_long_paired_output
# Troubleshooting switch.
- name: Debug
  arguments:
  - name: --debug
    type: boolean_true
    description: Print debug information
# The component's script.
resources:
- type: bash_script
  path: script.sh
# The script listed as the component's test.
test_resources:
- type: bash_script
  path: test.sh
# Docker engine: starts from the python:3.12 image, installs cutadapt via pip,
# then runs a command that writes the cutadapt version to
# /var/software_versions.txt.
engines:
- type: docker
  image: "python:3.12"
  setup:
  - type: python
    pip: [cutadapt]
  - type: docker
    run: |
      cutadapt --version | sed 's/\(.*\)/cutadapt: "\1"/' > /var/software_versions.txt
# Runner types declared for this component.
runners:
- type: executable
- type: nextflow