Files
biobox/target/executable/fastp/.config.vsh.yaml
CI 6f2f840fd9 Build branch main with version main (7f8bcc2)
Build pipeline: viash-hub.biobox.main-zp6tq

Source commit: 7f8bcc2b3e

Source message: BD rhapsody sequence analysis (#96)

* wip

* fix test

* add help

* update 2.2 args

* fix bug

* extend test data

* output separate files

* analyse missing args

* tweaks to test

* fix script

* fix test

* fix test

* move small reference

* wip generate wta test data

* don't forget about umi in r1

* remove unneeded pkg

* load reference in memory just once

* fix random choices

* extend test

* add abc immunediscoverypanel

* wip abc testing code

* fix abc test; need unique instrument, run and flowcell ids for each sample

* add smk data

* add entry to changelog

* remove old test file

* adapt test for missing read

* update description

* add comment

* ensure cwl files are absolute

* Apply suggestions from code review

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>

* fix suggestion

* newer pipelines have docker requirements as a hint instead of a strict requirement

* rename str to content

* remove deleted resources

* fix containers

* fix script

* fix suggestion

* fix suggestion...

* fix test

* fix component name

* fix test

* apply suggestions

* fix test

* added note

* fix changelog

* fix changelog again

* splitting hairs here

---------

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
2024-09-17 10:13:18 +00:00

1111 lines
34 KiB
YAML

name: "fastp"
version: "main"
authors:
- name: "Robrecht Cannoodt"
roles:
- "author"
- "maintainer"
info:
links:
email: "robrecht@data-intuitive.com"
github: "rcannood"
orcid: "0000-0003-3641-729X"
linkedin: "robrechtcannoodt"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Science Engineer"
- name: "Open Problems"
href: "https://openproblems.bio"
role: "Core Member"
argument_groups:
- name: "Inputs"
description: "`fastp` supports both single-end (SE) and paired-end (PE) input.\n\
\n- for SE data, you only have to specify read1 input by `-i` or `--in1`.\n- for\
\ PE data, you should also specify read2 input by `-I` or `--in2`.\n"
arguments:
- type: "file"
name: "--in1"
alternatives:
- "-i"
description: "Input FastQ file. Must be single-end or paired-end R1. Can be gzipped."
info: null
example:
- "in.R1.fq.gz"
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--in2"
alternatives:
- "-I"
description: "Input FastQ file. Must be paired-end R2. Can be gzipped."
info: null
example:
- "in.R2.fq.gz"
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Outputs"
description: "\n- for SE data, you only have to specify read1 output by `-o` or\
\ `--out1`.\n- for PE data, you should also specify read2 output by `-O` or `--out2`.\n\
- if you don't specify the output file names, no output files will be written,\
\ but the QC will still be done for both data before and after filtering.\n- the\
\ output will be gzip-compressed if its file name ends with `.gz`\n"
arguments:
- type: "file"
name: "--out1"
alternatives:
- "-o"
description: "The single-end or paired-end R1 reads that pass QC. Will be gzipped\
\ if its file name ends with `.gz`."
info: null
example:
- "out.R1.fq.gz"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--out2"
alternatives:
- "-O"
description: "The paired-end R2 reads that pass QC. Will be gzipped if its file\
\ name ends with `.gz`."
info: null
example:
- "out.R2.fq.gz"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--unpaired1"
description: "Store the reads that `read1` passes filters but its paired `read2`\
\ doesn't."
info: null
example:
- "unpaired.R1.fq.gz"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--unpaired2"
description: "Store the reads that `read2` passes filters but its paired `read1`\
\ doesn't."
info: null
example:
- "unpaired.R2.fq.gz"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--failed_out"
description: "Store the reads that fail filters.\n\nIf one read failed and is\
\ written to --failed_out, its failure reason will be appended to its read name.\
\ For example, failed_quality_filter, failed_too_short etc.\nFor PE data, if\
\ unpaired reads are not stored (by giving --unpaired1 or --unpaired2), the\
\ failed pair of reads will be put together. If one read passes the filters\
\ but its pair doesn't, the failure reason will be paired_read_is_failing.\n"
info: null
example:
- "failed.fq.gz"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--overlapped_out"
description: "For each read pair, output the overlapped region if it has no\
\ mismatched bases.\n"
info: null
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- name: "Report output arguments"
arguments:
- type: "file"
name: "--json"
alternatives:
- "-j"
description: "The json format report file name\n"
info: null
example:
- "out.json"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--html"
description: "The html format report file name\n"
info: null
example:
- "out.html"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--report_title"
description: "The title of the html report, default is \"fastp report\".\n"
info: null
example:
- "fastp report"
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Adapter trimming"
description: "Adapter trimming is enabled by default, but you can disable it by\
\ `-A` or `--disable_adapter_trimming`. Adapter sequences can be automatically\
\ detected for both PE/SE data.\n\n- For SE data, the adapters are evaluated by\
\ analyzing the tails of the first ~1M reads. This evaluation may be inaccurate, and\
\ you can specify the adapter sequence by `-a` or `--adapter_sequence` option.\
\ If adapter sequence is specified, the auto detection for SE data will be disabled.\n\
- For PE data, the adapters can be detected by per-read overlap analysis, which\
\ seeks for the overlap of each pair of reads. This method is robust and fast,\
\ so normally you don't have to input the adapter sequence even if you know it. But\
\ you can still specify the adapter sequences for read1 by `--adapter_sequence`,\
\ and for read2 by `--adapter_sequence_r2`. If `fastp` fails to find an overlap\
\ (i.e. due to low quality bases), it will use these sequences to trim adapters\
\ for read1 and read2 respectively.\n- For PE data, the adapter sequence auto-detection\
\ is disabled by default since the adapters can be trimmed by overlap analysis.\
\ However, you can specify `--detect_adapter_for_pe` to enable it.\n- For PE data,\
\ `fastp` will run a little slower if you specify the sequence adapters or enable\
\ adapter auto-detection, but usually result in a slightly cleaner output, since\
\ the overlap analysis may fail due to sequencing errors or adapter dimers.\n\
- The most widely used adapter is the Illumina TruSeq adapters. If your data is\
\ from the TruSeq library, you can add `--adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA\
\ --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT` to your command lines,\
\ or enable auto detection for PE data by specifying `--detect_adapter_for_pe`.\n\
- `fastp` contains some built-in known adapter sequences for better auto-detection.\
\ If you want to make some adapters to be a part of the built-in adapters, please\
\ file an issue.\n\nYou can also specify --adapter_fasta to give a FASTA file\
\ to tell fastp to trim multiple adapters in this FASTA file. Here is a sample\
\ of such adapter FASTA file:\n\n```\n>Illumina TruSeq Adapter Read 1\nAGATCGGAAGAGCACACGTCTGAACTCCAGTCA\n\
>Illumina TruSeq Adapter Read 2\nAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT\n>polyA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n\
```\n\nThe adapter sequence in this file should be at least 6bp long, otherwise\
\ it will be skipped. And you can give whatever you want to trim, rather than\
\ regular sequencing adapters (i.e. polyA).\n\n`fastp` first trims the auto-detected\
\ adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`,\
\ then trims the adapters given by `--adapter_fasta` one by one.\n\nThe sequence\
\ distribution of trimmed adapters can be found at the HTML/JSON reports.\n"
arguments:
- type: "boolean_true"
name: "--disable_adapter_trimming"
alternatives:
- "-A"
description: "Disable adapter trimming.\n"
info: null
direction: "input"
- type: "boolean_true"
name: "--detect_adapter_for_pe"
description: "By default, the auto-detection for adapter is for SE data input\
\ only, turn on this option to enable it for PE data.\n"
info: null
direction: "input"
- type: "string"
name: "--adapter_sequence"
alternatives:
- "-a"
description: "The adapter sequences to be trimmed. For SE data, if not specified,\
\ the adapters will be auto-detected. For PE data, this is used if R1/R2 are\
\ found not overlapped\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--adapter_sequence_r2"
description: "The adapter sequences to be trimmed for R2. This is used for PE\
\ data if R1/R2 are found not overlapped.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--adapter_fasta"
description: "A FASTA file containing all the adapter sequences to be trimmed.\
\ For SE data, if not specified, the adapters will be auto-detected. For PE\
\ data, this is used if R1/R2 are found not overlapped.\n"
info: null
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Base trimming"
arguments:
- type: "integer"
name: "--trim_front1"
alternatives:
- "-f"
description: "Trimming how many bases in front for read1, default is 0.\n"
info: null
example:
- 0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--trim_tail1"
alternatives:
- "-t"
description: "Trimming how many bases in tail for read1, default is 0.\n"
info: null
example:
- 0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_len1"
alternatives:
- "-b"
description: "If read1 is longer than max_len1, then trim read1 at its tail to\
\ make it as long as max_len1. Default 0 means no limitation.\n"
info: null
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--trim_front2"
alternatives:
- "-F"
description: "Trimming how many bases in front for read2, default is 0.\n"
info: null
example:
- 0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--trim_tail2"
alternatives:
- "-T"
description: "Trimming how many bases in tail for read2, default is 0.\n"
info: null
example:
- 0
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--max_len2"
alternatives:
- "-B"
description: "If read2 is longer than max_len2, then trim read2 at its tail to\
\ make it as long as max_len2. Default 0 means no limitation.\n"
info: null
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Merging mode"
description: "Allows merging paired-end reads into a single longer read if they\
\ are overlapping."
arguments:
- type: "boolean_true"
name: "--merge"
alternatives:
- "-m"
description: "For paired-end input, merge each pair of reads into a single read\
\ if they are overlapped. The merged reads will be written to the file given\
\ by --merged_out, the unmerged reads will be written to the files specified\
\ by --out1 and --out2. The merging mode is disabled by default.\n"
info: null
direction: "input"
- type: "file"
name: "--merged_out"
description: "In the merging mode, specify the file name to store merged output,\
\ or specify --stdout to stream the merged output.\n"
info: null
example:
- "merged.fq.gz"
must_exist: true
create_parent: true
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--include_unmerged"
description: "In the merging mode, write the unmerged or unpaired reads to the\
\ file specified by --merged_out. Disabled by default.\n"
info: null
direction: "input"
- name: "Additional input arguments"
description: "Affects how the input is read."
arguments:
- type: "boolean_true"
name: "--interleaved_in"
description: "Indicate that <in1> is an interleaved FASTQ which contains both\
\ read1 and read2. Disabled by default.\n"
info: null
direction: "input"
- type: "boolean_true"
name: "--fix_mgi_id"
description: "The MGI FASTQ ID format is not compatible with many BAM operation\
\ tools, enable this option to fix it.\n"
info: null
direction: "input"
- type: "boolean_true"
name: "--phred64"
alternatives:
- "-6"
description: "Indicate the input is using phred64 scoring (it'll be converted\
\ to phred33, so the output will still be phred33)\n"
info: null
direction: "input"
- name: "Additional output arguments"
description: "Affects how the output is written."
arguments:
- type: "integer"
name: "--compression"
alternatives:
- "-z"
description: "Compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest,\
\ default is 4.\n"
info: null
example:
- 4
required: false
min: 1
max: 9
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--dont_overwrite"
description: "Don't overwrite existing files. Overwriting is allowed by default.\n"
info: null
direction: "input"
- name: "Logging arguments"
arguments:
- type: "boolean_true"
name: "--verbose"
alternatives:
- "-V"
description: "Output verbose log information (i.e. when every 1M reads are processed)."
info: null
direction: "input"
- name: "Processing arguments"
arguments:
- type: "long"
name: "--reads_to_process"
description: "Specify how many reads/pairs to be processed. Default 0 means process\
\ all reads.\n"
info: null
example:
- 1000000
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Deduplication arguments"
arguments:
- type: "boolean_true"
name: "--dedup"
description: "Enable deduplication to drop the duplicated reads/pairs\n"
info: null
direction: "input"
- type: "integer"
name: "--dup_calc_accuracy"
description: "Accuracy level to calculate duplication (1~6). Higher level uses\
\ more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3\
\ for dedup mode.\n"
info: null
example:
- 3
required: false
min: 1
max: 6
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--dont_eval_duplication"
description: "Don't evaluate duplication rate to save time and use less memory.\n"
info: null
direction: "input"
- name: "PolyG tail trimming arguments"
arguments:
- type: "boolean_true"
name: "--trim_poly_g"
alternatives:
- "-g"
description: "Force polyG tail trimming, by default trimming is automatically\
\ enabled for Illumina NextSeq/NovaSeq data\n"
info: null
direction: "input"
- type: "integer"
name: "--poly_g_min_len"
description: "The minimum length to detect polyG in the read tail. 10 by default.\n"
info: null
example:
- 10
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "boolean_true"
name: "--disable_trim_poly_g"
alternatives:
- "-G"
description: "Disable polyG tail trimming, by default trimming is automatically\
\ enabled for Illumina NextSeq/NovaSeq data\n"
info: null
direction: "input"
- name: "PolyX tail trimming arguments"
arguments:
- type: "boolean_true"
name: "--trim_poly_x"
alternatives:
- "-x"
description: "Enable polyX trimming in 3' ends.\n"
info: null
direction: "input"
- type: "integer"
name: "--poly_x_min_len"
description: "The minimum length to detect polyX in the read tail. 10 by default.\n"
info: null
example:
- 10
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Cut arguments"
arguments:
- type: "integer"
name: "--cut_front"
alternatives:
- "-5"
description: "Move a sliding window from front (5') to tail, drop the bases in\
\ the window if its mean quality < threshold, stop otherwise.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_tail"
alternatives:
- "-3"
description: "Move a sliding window from tail (3') to front, drop the bases in\
\ the window if its mean quality < threshold, stop otherwise.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_right"
alternatives:
- "-r"
description: "Move a sliding window from front to tail, if meet one window with\
\ mean quality < threshold, drop the bases in the window and the right part,\
\ and then stop.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_window_size"
alternatives:
- "-W"
description: "The window size option shared by cut_front, cut_tail or cut_right.\
\ Range: 1~1000, default: 4.\n"
info: null
example:
- 4
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_mean_quality"
alternatives:
- "-M"
description: "The mean quality requirement option shared by cut_front, cut_tail\
\ or cut_right. Range: 1~36, default: 20 (Q20).\n"
info: null
example:
- 20
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_front_window_size"
description: "The window size option of cut_front, default to cut_window_size\
\ if not specified.\n"
info: null
example:
- 4
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_front_mean_quality"
description: "The mean quality requirement option of cut_front, default to cut_mean_quality\
\ if not specified.\n"
info: null
example:
- 20
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_tail_window_size"
description: "The window size option of cut_tail, default to cut_window_size if\
\ not specified.\n"
info: null
example:
- 4
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_tail_mean_quality"
description: "The mean quality requirement option of cut_tail, default to cut_mean_quality\
\ if not specified.\n"
info: null
example:
- 20
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_right_window_size"
description: "The window size option of cut_right, default to cut_window_size\
\ if not specified.\n"
info: null
example:
- 4
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--cut_right_mean_quality"
description: "The mean quality requirement option of cut_right, default to cut_mean_quality\
\ if not specified.\n"
info: null
example:
- 20
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Quality filtering arguments"
arguments:
- type: "boolean_true"
name: "--disable_quality_filtering"
alternatives:
- "-Q"
description: "Quality filtering is enabled by default. If this option is specified,\
\ quality filtering is disabled.\n"
info: null
direction: "input"
- type: "integer"
name: "--qualified_quality_phred"
alternatives:
- "-q"
description: "The quality value that a base is qualified. Default 15 means phred\
\ quality >=Q15 is qualified.\n"
info: null
example:
- 15
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--unqualified_percent_limit"
alternatives:
- "-u"
description: "How many percents of bases are allowed to be unqualified (0~100).\
\ Default 40 means 40%.\n"
info: null
example:
- 40
required: false
min: 0
max: 100
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--n_base_limit"
alternatives:
- "-n"
description: "If one read's number of N base is >n_base_limit, then this read/pair\
\ is discarded. Default is 5.\n"
info: null
example:
- 5
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--average_qual"
alternatives:
- "-e"
description: "If one read's average quality score <avg_qual, then this read/pair\
\ is discarded. Default 0 means no requirement.\n"
info: null
example:
- 0
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Length filtering arguments"
arguments:
- type: "boolean_true"
name: "--disable_length_filtering"
alternatives:
- "-L"
description: "Length filtering is enabled by default. If this option is specified,\
\ length filtering is disabled.\n"
info: null
direction: "input"
- type: "integer"
name: "--length_required"
alternatives:
- "-l"
description: "Reads shorter than length_required will be discarded, default is\
\ 15.\n"
info: null
example:
- 15
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--length_limit"
description: "Reads longer than length_limit will be discarded, default 0 means\
\ no limitation.\n"
info: null
example:
- 0
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Low complexity filtering arguments"
arguments:
- type: "boolean_true"
name: "--low_complexity_filter"
alternatives:
- "-y"
description: "Enable low complexity filter. The complexity is defined as the percentage\
\ of base that is different from its next base (base[i] != base[i+1]).\n"
info: null
direction: "input"
- type: "integer"
name: "--complexity_threshold"
alternatives:
- "-Y"
description: "The threshold for low complexity filter (0~100). Default is 30,\
\ which means 30% complexity is required.\n"
info: null
example:
- 30
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Index filtering arguments"
arguments:
- type: "file"
name: "--filter_by_index1"
description: "Specify a file containing a list of barcodes of index1 to be filtered\
\ out, one barcode per line.\n"
info: null
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--filter_by_index2"
description: "Specify a file containing a list of barcodes of index2 to be filtered\
\ out, one barcode per line.\n"
info: null
must_exist: true
create_parent: true
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--filter_by_index_threshold"
description: "The allowed difference of index barcode for index filtering, default\
\ 0 means completely identical.\n"
info: null
example:
- 0
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Overlapped region correction"
arguments:
- type: "boolean_true"
name: "--correction"
alternatives:
- "-c"
description: "Enable base correction in overlapped regions (only for PE data),\
\ default is disabled.\n"
info: null
direction: "input"
- type: "integer"
name: "--overlap_len_require"
description: "The minimum length to detect overlapped region of PE reads. This\
\ will affect overlap analysis based PE merge, adapter trimming and correction.\
\ 30 by default.\n"
info: null
example:
- 30
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--overlap_diff_limit"
description: "The maximum number of mismatched bases to detect overlapped region\
\ of PE reads. This will affect overlap analysis based PE merge, adapter trimming\
\ and correction. 5 by default.\n"
info: null
example:
- 5
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--overlap_diff_percent_limit"
description: "The maximum percentage of mismatched bases to detect overlapped\
\ region of PE reads. This will affect overlap analysis based PE merge, adapter\
\ trimming and correction. Default 20 means 20%.\n"
info: null
example:
- 20
required: false
min: 0
max: 100
direction: "input"
multiple: false
multiple_sep: ";"
- name: "UMI arguments"
arguments:
- type: "boolean_true"
name: "--umi"
alternatives:
- "-U"
description: "Enable unique molecular identifier (UMI) preprocessing.\n"
info: null
direction: "input"
- type: "string"
name: "--umi_loc"
description: "Specify the location of UMI, can be (index1/index2/read1/read2/per_index/per_read),\
\ default is none.\n"
info: null
required: false
choices:
- "index1"
- "index2"
- "read1"
- "read2"
- "per_index"
- "per_read"
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--umi_len"
description: "If the UMI is in read1/read2, its length should be provided.\n"
info: null
example:
- 0
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--umi_prefix"
description: "If specified, an underscore will be used to connect prefix and UMI\
\ (i.e. prefix=UMI, UMI=AATTCG, final=UMI_AATTCG). No prefix by default.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--umi_skip"
description: "If the UMI is in read1/read2, fastp can skip several bases following\
\ UMI, default is 0.\n"
info: null
example:
- 0
required: false
min: 0
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--umi_delim"
description: "If the UMI is in index1/index2, fastp can use a delimiter to separate\
\ UMI from the read sequence, default is none.\n"
info: null
required: false
direction: "input"
multiple: false
multiple_sep: ";"
- name: "Overrepresentation analysis arguments"
arguments:
- type: "boolean_true"
name: "--overrepresentation_analysis"
alternatives:
- "-p"
description: "Enable overrepresentation analysis.\n"
info: null
direction: "input"
- type: "integer"
name: "--overrepresentation_sampling"
description: "One in (--overrepresentation_sampling) reads will be computed for\
\ overrepresentation analysis (1~10000), smaller is slower, default is 20.\n"
info: null
example:
- 20
required: false
min: 1
direction: "input"
multiple: false
multiple_sep: ";"
resources:
- type: "bash_script"
path: "script.sh"
is_executable: true
description: "An ultra-fast all-in-one FASTQ preprocessor (QC/adapters/trimming/filtering/splitting/merging...).\n\
\nFeatures:\n\n - comprehensive quality profiling for both before and after filtering\
\ data (quality curves, base contents, KMER, Q20/Q30, GC Ratio, duplication, adapter\
\ contents...)\n - filter out bad reads (too low quality, too short, or too many\
\ N...)\n - cut low quality bases for per read in its 5' and 3' by evaluating the\
\ mean quality from a sliding window (like Trimmomatic but faster).\n - trim all\
\ reads in front and tail\n - cut adapters. Adapter sequences can be automatically\
\ detected, which means you don't have to input the adapter sequences to trim them.\n\
\ - correct mismatched base pairs in overlapped regions of paired end reads, if\
\ one base is with high quality while the other is with ultra low quality\n - trim\
\ polyG in 3' ends, which is commonly seen in NovaSeq/NextSeq data. Trim polyX in\
\ 3' ends to remove unwanted polyX tailing (i.e. polyA tailing for mRNA-Seq data)\n\
\ - preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence\
\ name.\n - report JSON format result for further interpreting.\n - visualize\
\ quality control and filtering results on a single HTML page (like FASTQC but faster\
\ and more informative).\n - split the output to multiple files (0001.R1.gz, 0002.R1.gz...)\
\ to support parallel processing. Two modes can be used, limiting the total split\
\ file number, or limiting the lines of each split file.\n - support long reads\
\ (data from PacBio / Nanopore devices).\n - support reading from STDIN and writing\
\ to STDOUT\n - support interleaved input\n - support ultra-fast FASTQ-level deduplication\n"
test_resources:
- type: "bash_script"
path: "test.sh"
is_executable: true
- type: "file"
path: "test_data"
info: null
status: "enabled"
requirements:
commands:
- "ps"
keywords:
- "RNA-Seq"
- "Trimming"
- "Quality control"
license: "MIT"
references:
doi:
- "10.1093/bioinformatics/bty560"
links:
repository: "https://github.com/OpenGene/fastp"
documentation: "https://github.com/OpenGene/fastp/blob/master/README.md"
runners:
- type: "executable"
id: "executable"
docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
id: "nextflow"
directives:
tag: "$id"
auto:
simplifyInput: true
simplifyOutput: false
transcript: false
publish: false
config:
labels:
mem1gb: "memory = 1000000000.B"
mem2gb: "memory = 2000000000.B"
mem5gb: "memory = 5000000000.B"
mem10gb: "memory = 10000000000.B"
mem20gb: "memory = 20000000000.B"
mem50gb: "memory = 50000000000.B"
mem100gb: "memory = 100000000000.B"
mem200gb: "memory = 200000000000.B"
mem500gb: "memory = 500000000000.B"
mem1tb: "memory = 1000000000000.B"
mem2tb: "memory = 2000000000000.B"
mem5tb: "memory = 5000000000000.B"
mem10tb: "memory = 10000000000000.B"
mem20tb: "memory = 20000000000000.B"
mem50tb: "memory = 50000000000000.B"
mem100tb: "memory = 100000000000000.B"
mem200tb: "memory = 200000000000000.B"
mem500tb: "memory = 500000000000000.B"
mem1gib: "memory = 1073741824.B"
mem2gib: "memory = 2147483648.B"
mem4gib: "memory = 4294967296.B"
mem8gib: "memory = 8589934592.B"
mem16gib: "memory = 17179869184.B"
mem32gib: "memory = 34359738368.B"
mem64gib: "memory = 68719476736.B"
mem128gib: "memory = 137438953472.B"
mem256gib: "memory = 274877906944.B"
mem512gib: "memory = 549755813888.B"
mem1tib: "memory = 1099511627776.B"
mem2tib: "memory = 2199023255552.B"
mem4tib: "memory = 4398046511104.B"
mem8tib: "memory = 8796093022208.B"
mem16tib: "memory = 17592186044416.B"
mem32tib: "memory = 35184372088832.B"
mem64tib: "memory = 70368744177664.B"
mem128tib: "memory = 140737488355328.B"
mem256tib: "memory = 281474976710656.B"
mem512tib: "memory = 562949953421312.B"
cpu1: "cpus = 1"
cpu2: "cpus = 2"
cpu5: "cpus = 5"
cpu10: "cpus = 10"
cpu20: "cpus = 20"
cpu50: "cpus = 50"
cpu100: "cpus = 100"
cpu200: "cpus = 200"
cpu500: "cpus = 500"
cpu1000: "cpus = 1000"
debug: false
container: "docker"
engines:
- type: "docker"
id: "docker"
image: "quay.io/biocontainers/fastp:0.23.4--hadf994f_2"
target_registry: "images.viash-hub.com"
target_tag: "main"
namespace_separator: "/"
setup:
- type: "docker"
run:
- "fastp --version 2>&1 | sed 's# #: \"#;s#$#\"#' > /var/software_versions.txt\n"
entrypoint: []
cmd: null
- type: "native"
id: "native"
build_info:
config: "src/fastp/config.vsh.yaml"
runner: "executable"
engine: "docker|native"
output: "target/executable/fastp"
executable: "target/executable/fastp/fastp"
viash_version: "0.9.0"
git_commit: "7f8bcc2b3e1ffaac9778b6acb42420b19660d1a1"
git_remote: "https://github.com/viash-hub/biobox"
git_tag: "v0.2.0-3-g7f8bcc2"
package_config:
name: "biobox"
version: "main"
description: "A collection of bioinformatics tools for working with sequence data.\n"
info: null
viash_version: "0.9.0"
source: "src"
target: "target"
config_mods:
- ".requirements.commands := ['ps']\n"
- ".engines += { type: \"native\" }"
- ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
- ".engines[.type == 'docker'].target_tag := 'main'"
keywords:
- "bioinformatics"
- "modules"
- "sequencing"
license: "MIT"
organization: "vsh"
links:
repository: "https://github.com/viash-hub/biobox"
issue_tracker: "https://github.com/viash-hub/biobox/issues"