# Build pipeline: viash-hub.biobox.main-bv4sf
# Source commit: add125261c
# Source message: FEAT: avoid using boolean_false (#160)
# 444 lines, 16 KiB, YAML
# Component identity: component `name` inside `namespace`, at version `main`.
name: 'rsem_prepare_reference'
namespace: 'rsem'
version: 'main'

# Authors of this component, with their roles and contact/affiliation details.
authors:
- name: 'Sai Nirmayi Yasa'
  roles:
  - 'author'
  - 'maintainer'
  info:
    links:
      email: 'nirmayi@data-intuitive.com'
      github: 'sainirmayi'
      linkedin: 'sai-nirmayi-yasa'
    organizations:
    - name: 'Data Intuitive'
      href: 'https://www.data-intuitive.com'
      role: 'Junior Bioinformatics Researcher'
argument_groups:
# Required inputs: the reference sequences and the name (prefix) under which
# the reference files are generated.
- name: 'Inputs'
  arguments:
  # One or more FASTA files (or a directory), separated by ';'.
  - type: 'file'
    name: '--reference_fasta_files'
    description: >
      Semi-colon separated list of Multi-FASTA formatted files OR a directory
      name. If a directory name is specified, RSEM will read all files with
      suffix ".fa" or ".fasta" in this directory. The files should contain
      either the sequences of transcripts or an entire genome, depending on
      whether the '--gtf' option is used.
    info: null
    example:
    - 'read1.fasta'
    must_exist: true
    create_parent: true
    required: true
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  # Prefix for the generated reference files; may include a path.
  - type: 'string'
    name: '--reference_name'
    description: >
      The name of the reference used. RSEM will generate several
      reference-related files that are prefixed by this name. This name can
      contain path information (e.g. '/ref/mm9').
    info: null
    example:
    - '/ref/mm9'
    required: true
    direction: 'input'
    multiple: false
    multiple_sep: ';'
# Output group: a single required argument pointing at the directory that
# holds the reference files generated by RSEM.
- name: 'Outputs'
  arguments:
  - type: 'file'
    name: '--output'
    description: 'Directory containing reference files generated by RSEM.'
    info: null
    must_exist: true
    create_parent: true
    required: true
    direction: 'output'
    multiple: false
    multiple_sep: ';'
# Optional arguments: annotation sources, transcript/gene mappings, poly(A)
# handling, which aligner indices to build, and logging verbosity.
- name: 'Other options'
  arguments:
  # --- Annotation sources -------------------------------------------------
  - type: 'file'
    name: '--gtf'
    description: >-
      Assume that 'reference_fasta_files' contains the sequence of a genome,
      and extract transcript reference sequences using the gene annotations
      specified in the GTF file. If this and '--gff3' options are not provided,
      RSEM will assume 'reference_fasta_files' contains the reference
      transcripts. In this case, RSEM assumes that name of each sequence in the
      Multi-FASTA files is its transcript_id.
    info: null
    example:
    - 'annotations.gtf'
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'file'
    name: '--gff3'
    description: >-
      GFF3 annotation file. Converted to GTF format with the file name
      'reference_name.gtf'. Please make sure that 'reference_name.gtf' does not
      exist.
    info: null
    example:
    - 'annotations.gff'
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'string'
    name: '--gff3_rna_patterns'
    description: >-
      List of transcript categories (separated by semi-colon). Only transcripts
      that match the string will be extracted.
    info: null
    example:
    - 'mRNA;rRNA'
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--gff3_genes_as_transcripts'
    description: >-
      This option is designed for untypical organisms, such as viruses, whose
      GFF3 files only contain genes. RSEM will assume each gene as a unique
      transcript when it converts the GFF3 file into GTF format.
    info: null
    direction: 'input'
  - type: 'string'
    name: '--trusted_sources'
    description: >-
      List of trusted sources (separated by semi-colon). Only transcripts
      coming from these sources will be extracted. If this option is off, all
      sources are accepted.
    info: null
    example:
    - 'ENSEMBL;HAVANA'
    required: false
    direction: 'input'
    multiple: true
    multiple_sep: ';'
  # --- Transcript / allele to gene mappings -------------------------------
  # The two descriptions below contain embedded newlines and significant
  # leading/trailing spaces, so they stay as escaped double-quoted scalars.
  - type: 'file'
    name: '--transcript_to_gene_map'
    description: "Use information from this file to map from transcript (isoform)\
      \ ids to gene ids. Each line of this file should be of the form: \n gene_id\
      \ transcript_id\nwith the two fields separated by a tab character.\nIf you are\
      \ using a GTF file for the \"UCSC Genes\" gene set from the UCSC Genome Browser,\
      \ then the \"knownIsoforms.txt\" file (obtained from the \"Downloads\" section\
      \ of the UCSC Genome Browser site) is of this format. \nIf this option is off,\
      \ then the mapping of isoforms to genes depends on whether the '--gtf' option\
      \ is specified. If '--gtf' is specified, then RSEM uses the \"gene_id\" and\
      \ \"transcript_id\" attributes in the GTF file. Otherwise, RSEM assumes that\
      \ each sequence in the reference sequence files is a separate gene.\n"
    info: null
    example:
    - 'isoforms.txt'
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'file'
    name: '--allele_to_gene_map'
    description: "Use information from <file> to provide gene_id and transcript_id\
      \ information for each allele-specific transcript. Each line of <file> should\
      \ be of the form:\n gene_id transcript_id allele_id\nwith the fields separated\
      \ by a tab character.\nThis option is designed for quantifying allele-specific\
      \ expression. It is only valid if '--gtf' option is not specified. allele_id\
      \ should be the sequence names presented in the Multi-FASTA-formatted files.\n"
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  # --- Poly(A) handling ---------------------------------------------------
  - type: 'boolean_true'
    name: '--polyA'
    # The description refers to this component's own '--polyA_length'
    # argument (declared right below), spelled with an underscore.
    description: >-
      Add poly(A) tails to the end of all reference isoforms. The length of
      poly(A) tail added is specified by '--polyA_length' option. STAR aligner
      users may not want to use this option.
    info: null
    direction: 'input'
  - type: 'integer'
    name: '--polyA_length'
    description: 'The length of the poly(A) tails to be added.'
    info: null
    example:
    - 125
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'file'
    name: '--no_polyA_subset'
    description: >-
      Only meaningful if '--polyA' is specified. Do not add poly(A) tails to
      those transcripts listed in this file containing a list of
      transcript_ids.
    info: null
    example:
    - 'transcript_ids.txt'
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  # --- Aligner indices to build -------------------------------------------
  - type: 'boolean_true'
    name: '--bowtie'
    description: 'Build Bowtie indices.'
    info: null
    direction: 'input'
  - type: 'boolean_true'
    name: '--bowtie2'
    description: 'Build Bowtie 2 indices.'
    info: null
    direction: 'input'
  - type: 'boolean_true'
    name: '--star'
    description: 'Build STAR indices.'
    info: null
    direction: 'input'
  - type: 'integer'
    name: '--star_sjdboverhang'
    description: >-
      Length of the genomic sequence around annotated junction. It is only used
      for STAR to build splice junctions database and not needed for Bowtie or
      Bowtie2. It will be passed as the --sjdbOverhang option to STAR.
      According to STAR's manual, its ideal value is max(ReadLength)-1, e.g.
      for 2x101 paired-end reads, the ideal value is 101-1=100. In most cases,
      the default value of 100 will work as well as the ideal value.
      (Default is 100)
    info: null
    example:
    - 100
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
  - type: 'boolean_true'
    name: '--hisat2_hca'
    description: >-
      Build HISAT2 indices on the transcriptome according to Human Cell Atlas
      (HCA) SMART-Seq2 pipeline.
    info: null
    direction: 'input'
  # --- Logging ------------------------------------------------------------
  - type: 'boolean_true'
    name: '--quiet'
    alternatives:
    - '-q'
    description: 'Suppress the output of logging information.'
    info: null
    direction: 'input'
# Arguments specific to prior-enhanced RSEM (pRSEM): a switch to prepare the
# pRSEM reference files and the mappability bigWig file it relies on.
- name: 'Prior-enhanced RSEM options'
  arguments:
  - type: 'boolean_true'
    name: '--prep_pRSEM'
    description: >-
      A Boolean indicating whether to prepare reference files for pRSEM,
      including building Bowtie indices for a genome and selecting training set
      isoforms. The index files will be used for aligning ChIP-seq reads in
      prior-enhanced RSEM and the training set isoforms will be used for
      learning prior. A path to Bowtie executables and a mappability file in
      bigWig format are required when this option is on. Currently, Bowtie2 is
      not supported for prior-enhanced RSEM.
    info: null
    direction: 'input'
  - type: 'file'
    name: '--mappability_bigwig_file'
    description: >-
      Full path to a whole-genome mappability file in bigWig format. This file
      is required for running prior-enhanced RSEM. It is used for selecting a
      training set of isoforms for prior-learning. This file can be either
      downloaded from UCSC Genome Browser or generated by GEM (Derrien et al.,
      2012, PLoS One).
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: 'input'
    multiple: false
    multiple_sep: ';'
# Bash script resource of the component.
resources:
- type: 'bash_script'
  path: 'script.sh'
  is_executable: true

description: >
  RSEM is a software package for estimating gene and isoform expression
  levels from RNA-Seq data. This component prepares transcript references for
  RSEM.

# Bash script resource used for testing the component.
test_resources:
- type: 'bash_script'
  path: 'test.sh'
  is_executable: true

info: null
status: 'enabled'

# Commands listed as requirements of the component.
requirements:
  commands:
  - 'ps'

keywords:
- 'Transcriptome'
- 'Index'
license: 'GPL-3.0'

# Publication and upstream project locations for RSEM.
references:
  doi:
  - '10.1186/1471-2105-12-323'
links:
  repository: 'https://github.com/deweylab/RSEM'
  homepage: 'http://deweylab.github.io/RSEM'
  documentation: 'https://deweylab.github.io/RSEM/rsem-prepare-reference.html'
# Two runners are declared: a plain executable and a Nextflow module.
runners:
- type: 'executable'
  id: 'executable'
  docker_setup_strategy: 'ifneedbepullelsecachedbuild'
- type: 'nextflow'
  id: 'nextflow'
  directives:
    tag: '$id'
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    # Each label maps a short name to a single directive assignment string.
    labels:
      # Memory, decimal units (1 gb = 10^9 bytes).
      mem1gb: 'memory = 1000000000.B'
      mem2gb: 'memory = 2000000000.B'
      mem5gb: 'memory = 5000000000.B'
      mem10gb: 'memory = 10000000000.B'
      mem20gb: 'memory = 20000000000.B'
      mem50gb: 'memory = 50000000000.B'
      mem100gb: 'memory = 100000000000.B'
      mem200gb: 'memory = 200000000000.B'
      mem500gb: 'memory = 500000000000.B'
      mem1tb: 'memory = 1000000000000.B'
      mem2tb: 'memory = 2000000000000.B'
      mem5tb: 'memory = 5000000000000.B'
      mem10tb: 'memory = 10000000000000.B'
      mem20tb: 'memory = 20000000000000.B'
      mem50tb: 'memory = 50000000000000.B'
      mem100tb: 'memory = 100000000000000.B'
      mem200tb: 'memory = 200000000000000.B'
      mem500tb: 'memory = 500000000000000.B'
      # Memory, binary units (1 gib = 2^30 bytes).
      mem1gib: 'memory = 1073741824.B'
      mem2gib: 'memory = 2147483648.B'
      mem4gib: 'memory = 4294967296.B'
      mem8gib: 'memory = 8589934592.B'
      mem16gib: 'memory = 17179869184.B'
      mem32gib: 'memory = 34359738368.B'
      mem64gib: 'memory = 68719476736.B'
      mem128gib: 'memory = 137438953472.B'
      mem256gib: 'memory = 274877906944.B'
      mem512gib: 'memory = 549755813888.B'
      mem1tib: 'memory = 1099511627776.B'
      mem2tib: 'memory = 2199023255552.B'
      mem4tib: 'memory = 4398046511104.B'
      mem8tib: 'memory = 8796093022208.B'
      mem16tib: 'memory = 17592186044416.B'
      mem32tib: 'memory = 35184372088832.B'
      mem64tib: 'memory = 70368744177664.B'
      mem128tib: 'memory = 140737488355328.B'
      mem256tib: 'memory = 281474976710656.B'
      mem512tib: 'memory = 562949953421312.B'
      # CPU counts.
      cpu1: 'cpus = 1'
      cpu2: 'cpus = 2'
      cpu5: 'cpus = 5'
      cpu10: 'cpus = 10'
      cpu20: 'cpus = 20'
      cpu50: 'cpus = 50'
      cpu100: 'cpus = 100'
      cpu200: 'cpus = 200'
      cpu500: 'cpus = 500'
      cpu1000: 'cpus = 1000'
  debug: false
  container: 'docker'
# Execution engines: a Docker image built on ubuntu:22.04 with RSEM and the
# supported aligners installed, plus a native engine without any setup.
engines:
- type: 'docker'
  id: 'docker'
  image: 'ubuntu:22.04'
  target_registry: 'images.viash-hub.com'
  target_tag: 'main'
  namespace_separator: '/'
  setup:
  # Step 1: system packages. One package per list item ("unzip" and "xxd"
  # were previously packed into a single entry).
  - type: 'apt'
    packages:
    - 'build-essential'
    - 'gcc'
    - 'g++'
    - 'make'
    - 'wget'
    - 'zlib1g-dev'
    - 'unzip'
    - 'xxd'
    - 'perl'
    - 'r-base'
    - 'bowtie2'
    - 'pip'
    - 'git'
    interactive: false
  # Step 2: Python packages.
  # NOTE(review): the Bowtie aligner binaries are copied to /usr/local/bin
  # from the SourceForge zip in step 3; confirm that the PyPI package named
  # "bowtie" is really what is wanted here.
  - type: 'python'
    user: false
    packages:
    - 'bowtie'
    upgrade: true
  # Step 3: set the timezone from $TZ, then download and install STAR, RSEM,
  # Bowtie and HISAT2, and clean up afterwards. Versions come from `env`.
  # NOTE(review): every wget uses --no-check-certificate, i.e. the downloaded
  # sources are not TLS-verified.
  # NOTE(review): hisat2 is cloned without a tag/commit, unlike the versioned
  # STAR/RSEM/Bowtie downloads.
  # NOTE(review): ${PACKAGES} is not defined in `env` below, so it presumably
  # expands to nothing in the autoremove call; confirm intent.
  - type: 'docker'
    run:
    - "ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone\
      \ && \\\ncd /tmp && \\\nwget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip\
      \ && \\\nunzip ${STAR_VERSION}.zip && \\\ncd STAR-${STAR_VERSION}/source &&\
      \ \\\nmake STARstatic CXXFLAGS_SIMD=-std=c++11 && \\\ncp STAR /usr/local/bin\
      \ && \\\ncd /tmp && \\\nwget --no-check-certificate https://github.com/deweylab/RSEM/archive/refs/tags/v${RSEM_VERSION}.zip\
      \ && \\\nunzip v${RSEM_VERSION}.zip && \\\ncd RSEM-${RSEM_VERSION} && \\\nmake\
      \ && \\\nmake install && \\\ncd /tmp && \\\nwget --no-check-certificate -O bowtie-${BOWTIE_VERSION}-linux-x86_64.zip\
      \ https://sourceforge.net/projects/bowtie-bio/files/bowtie/${BOWTIE_VERSION}/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip/download\
      \ && \\\nunzip bowtie-${BOWTIE_VERSION}-linux-x86_64.zip && \\\ncp bowtie-${BOWTIE_VERSION}-linux-x86_64/bowtie*\
      \ /usr/local/bin && \\\ncd /tmp && \\\ngit clone https://github.com/DaehwanKimLab/hisat2.git\
      \ /tmp/hisat2 && \\\ncd /tmp/hisat2 && \\\nmake && \\\ncp -r hisat2* /usr/local/bin\
      \ && \\\ncd && \\\nrm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip\
      \ /tmp/bowtie-${BOWTIE_VERSION}-linux-x86_64 /tmp/hisat2 && \\\napt-get --purge\
      \ autoremove -y ${PACKAGES} && \\\napt-get clean \n"
    env:
    - 'STAR_VERSION=2.7.11b'
    - 'RSEM_VERSION=1.3.3'
    - 'BOWTIE_VERSION=1.3.1'
    - 'TZ=Europe/Brussels'
  # Step 4: write the versions of the installed tools to
  # /var/software_versions.txt.
  - type: 'docker'
    run:
    - "echo \"RSEM: `rsem-calculate-expression --version | sed -e 's/Current version:\
      \ RSEM v//g'`\" > /var/software_versions.txt && \\\necho \"STAR: `STAR --version`\"\
      \ >> /var/software_versions.txt && \\\necho \"bowtie2: `bowtie2 --version |\
      \ grep -oP '\\d+\\.\\d+\\.\\d+'`\" >> /var/software_versions.txt && \\\necho\
      \ \"bowtie: `bowtie --version | grep -oP 'bowtie-align-s version \\K\\d+\\.\\\
      d+\\.\\d+'`\" >> /var/software_versions.txt && \\\necho \"HISAT2: `hisat2 --version\
      \ | grep -oP 'hisat2-align-s version \\K\\d+\\.\\d+\\.\\d+'`\" >> /var/software_versions.txt\n"
  entrypoint: []
  cmd: null
- type: 'native'
  id: 'native'
# Provenance of this build: source config, runner/engine used, output
# location, Viash version and the git state of the source repository.
build_info:
  config: 'src/rsem/rsem_prepare_reference/config.vsh.yaml'
  runner: 'executable'
  engine: 'docker|native'
  output: 'target/executable/rsem/rsem_prepare_reference'
  executable: 'target/executable/rsem/rsem_prepare_reference/rsem_prepare_reference'
  viash_version: '0.9.0'
  git_commit: 'add125261c6fa0ed7c9906fc85e7368d2072c4a3'
  # SECURITY: keep this URL free of credentials. Never record userinfo such
  # as `x-access-token:<token>@` here; a token that was ever published in
  # this field must be treated as leaked and revoked.
  git_remote: 'https://github.com/viash-hub/biobox'
  git_tag: 'v0.2.0-9-gadd1252'
# Settings of the enclosing package (`biobox`) that this component is part of.
package_config:
  name: 'biobox'
  version: 'main'
  description: >
    A collection of bioinformatics tools for working with sequence data.
  info: null
  viash_version: '0.9.0'
  source: 'src'
  target: 'target'
  # Config modification expressions; they set the 'ps' command requirement,
  # add a native engine, and set the docker target registry and tag.
  config_mods:
  - ".requirements.commands := ['ps']\n"
  - '.engines += { type: "native" }'
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'main'"
  keywords:
  - 'bioinformatics'
  - 'modules'
  - 'sequencing'
  license: 'MIT'
  organization: 'vsh'
  links:
    repository: 'https://github.com/viash-hub/biobox'
    issue_tracker: 'https://github.com/viash-hub/biobox/issues'