Files
biobox/target/executable/rsem/rsem_prepare_reference/.config.vsh.yaml
CI 09f5294af0 Build branch main with version main (add1252)
Build pipeline: viash-hub.biobox.main-bv4sf

Source commit: add125261c

Source message: FEAT: avoid using boolean_false (#160)
2024-10-07 09:26:30 +00:00

444 lines
16 KiB
YAML

# Component identity.
name: "rsem_prepare_reference"
namespace: "rsem"
version: "main"
# Contributors; each entry carries its roles plus contact and affiliation info.
authors:
- name: "Sai Nirmayi Yasa"
  roles:
  - "author"
  - "maintainer"
  info:
    links:
      email: "nirmayi@data-intuitive.com"
      github: "sainirmayi"
      linkedin: "sai-nirmayi-yasa"
    organizations:
    - name: "Data Intuitive"
      href: "https://www.data-intuitive.com"
      role: "Junior Bioinformatics Researcher"
# Command-line interface of the component, organised in named groups.
argument_groups:
# Required inputs: the reference sequences and the name/prefix of the reference.
- name: "Inputs"
  arguments:
  - type: "file"
    name: "--reference_fasta_files"
    description: "Semi-colon separated list of Multi-FASTA formatted files OR a directory\
      \ name. If a directory name is specified, RSEM will read all files with suffix\
      \ \".fa\" or \".fasta\" in this directory. The files should contain either the\
      \ sequences of transcripts or an entire genome, depending on whether the '--gtf'\
      \ option is used.\n"
    info: null
    example:
    - "read1.fasta"
    must_exist: true
    create_parent: true
    required: true
    direction: "input"
    # Several files may be passed, separated by ';'.
    multiple: true
    multiple_sep: ";"
  - type: "string"
    name: "--reference_name"
    description: "The name of the reference used. RSEM will generate several reference-related\
      \ files that are prefixed by this name. This name can contain path information\
      \ (e.g. '/ref/mm9').\n"
    info: null
    example:
    - "/ref/mm9"
    required: true
    direction: "input"
    multiple: false
    multiple_sep: ";"
# Single required output: the directory that receives the generated reference files.
- name: "Outputs"
  arguments:
  - type: "file"
    name: "--output"
    description: "Directory containing reference files generated by RSEM."
    info: null
    must_exist: true
    create_parent: true
    required: true
    direction: "output"
    multiple: false
    multiple_sep: ";"
# Optional settings: annotation inputs, transcript filters, id mappings,
# poly(A) handling, aligner index selection and logging.
- name: "Other options"
  arguments:
  # --- Annotation inputs -----------------------------------------------------
  - type: "file"
    name: "--gtf"
    description: "Assume that 'reference_fasta_files' contains the sequence of a genome,\
      \ and extract transcript reference sequences using the gene annotations specified\
      \ in the GTF file. If this and '--gff3' options are not provided, RSEM will\
      \ assume 'reference_fasta_files' contains the reference transcripts. In this\
      \ case, RSEM assumes that name of each sequence in the Multi-FASTA files is\
      \ its transcript_id."
    info: null
    example:
    - "annotations.gtf"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--gff3"
    description: "GFF3 annotation file. Converted to GTF format with the file name\
      \ 'reference_name.gtf'. Please make sure that 'reference_name.gtf' does not\
      \ exist."
    info: null
    example:
    - "annotations.gff"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "string"
    name: "--gff3_rna_patterns"
    description: "List of transcript categories (separated by semi-colon). Only transcripts\
      \ that match the string will be extracted."
    info: null
    example:
    - "mRNA;rRNA"
    required: false
    direction: "input"
    multiple: true
    multiple_sep: ";"
  - type: "boolean_true"
    name: "--gff3_genes_as_transcripts"
    description: "This option is designed for untypical organisms, such as viruses,\
      \ whose GFF3 files only contain genes. RSEM will assume each gene as a unique\
      \ transcript when it converts the GFF3 file into GTF format."
    info: null
    direction: "input"
  - type: "string"
    name: "--trusted_sources"
    description: "List of trusted sources (separated by semi-colon). Only transcripts\
      \ coming from these sources will be extracted. If this option is off, all sources\
      \ are accepted."
    info: null
    example:
    - "ENSEMBL;HAVANA"
    required: false
    direction: "input"
    multiple: true
    multiple_sep: ";"
  # --- Transcript / allele to gene mappings ----------------------------------
  - type: "file"
    name: "--transcript_to_gene_map"
    description: "Use information from this file to map from transcript (isoform)\
      \ ids to gene ids. Each line of this file should be of the form: \n gene_id\
      \ transcript_id\nwith the two fields separated by a tab character.\nIf you are\
      \ using a GTF file for the \"UCSC Genes\" gene set from the UCSC Genome Browser,\
      \ then the \"knownIsoforms.txt\" file (obtained from the \"Downloads\" section\
      \ of the UCSC Genome Browser site) is of this format. \nIf this option is off,\
      \ then the mapping of isoforms to genes depends on whether the '--gtf' option\
      \ is specified. If '--gtf' is specified, then RSEM uses the \"gene_id\" and\
      \ \"transcript_id\" attributes in the GTF file. Otherwise, RSEM assumes that\
      \ each sequence in the reference sequence files is a separate gene.\n"
    info: null
    example:
    - "isoforms.txt"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--allele_to_gene_map"
    description: "Use information from <file> to provide gene_id and transcript_id\
      \ information for each allele-specific transcript. Each line of <file> should\
      \ be of the form:\n gene_id transcript_id allele_id\nwith the fields separated\
      \ by a tab character.\nThis option is designed for quantifying allele-specific\
      \ expression. It is only valid if '--gtf' option is not specified. allele_id\
      \ should be the sequence names presented in the Multi-FASTA-formatted files.\n"
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # --- Poly(A) tail handling -------------------------------------------------
  - type: "boolean_true"
    name: "--polyA"
    # The description refers to this component's own flag name, '--polyA_length'.
    description: "Add poly(A) tails to the end of all reference isoforms. The length\
      \ of poly(A) tail added is specified by '--polyA_length' option. STAR aligner\
      \ users may not want to use this option."
    info: null
    direction: "input"
  - type: "integer"
    name: "--polyA_length"
    description: "The length of the poly(A) tails to be added."
    info: null
    example:
    - 125
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "file"
    name: "--no_polyA_subset"
    description: "Only meaningful if '--polyA' is specified. Do not add poly(A) tails\
      \ to those transcripts listed in this file containing a list of transcript_ids."
    info: null
    example:
    - "transcript_ids.txt"
    must_exist: true
    create_parent: true
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  # --- Aligner index selection -----------------------------------------------
  - type: "boolean_true"
    name: "--bowtie"
    description: "Build Bowtie indices."
    info: null
    direction: "input"
  - type: "boolean_true"
    name: "--bowtie2"
    description: "Build Bowtie 2 indices."
    info: null
    direction: "input"
  - type: "boolean_true"
    name: "--star"
    description: "Build STAR indices."
    info: null
    direction: "input"
  - type: "integer"
    name: "--star_sjdboverhang"
    description: "Length of the genomic sequence around annotated junction. It is\
      \ only used for STAR to build splice junctions database and not needed for Bowtie\
      \ or Bowtie2. It will be passed as the --sjdbOverhang option to STAR. According\
      \ to STAR's manual, its ideal value is max(ReadLength)-1, e.g. for 2x101 paired-end\
      \ reads, the ideal value is 101-1=100. In most cases, the default value of 100\
      \ will work as well as the ideal value. (Default is 100)"
    info: null
    example:
    - 100
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
  - type: "boolean_true"
    name: "--hisat2_hca"
    description: "Build HISAT2 indices on the transcriptome according to Human Cell\
      \ Atlas (HCA) SMART-Seq2 pipeline."
    info: null
    direction: "input"
  # --- Logging ---------------------------------------------------------------
  - type: "boolean_true"
    name: "--quiet"
    alternatives:
    - "-q"
    description: "Suppress the output of logging information."
    info: null
    direction: "input"
# Options for prior-enhanced RSEM (pRSEM) reference preparation.
- name: "Prior-enhanced RSEM options"
  arguments:
  - type: "boolean_true"
    name: "--prep_pRSEM"
    description: "A Boolean indicating whether to prepare reference files for pRSEM,\
      \ including building Bowtie indices for a genome and selecting training set\
      \ isoforms. The index files will be used for aligning ChIP-seq reads in prior-enhanced\
      \ RSEM and the training set isoforms will be used for learning prior. A path\
      \ to Bowtie executables and a mappability file in bigWig format are required\
      \ when this option is on. Currently, Bowtie2 is not supported for prior-enhanced\
      \ RSEM."
    info: null
    direction: "input"
  - type: "file"
    name: "--mappability_bigwig_file"
    description: "Full path to a whole-genome mappability file in bigWig format. This\
      \ file is required for running prior-enhanced RSEM. It is used for selecting\
      \ a training set of isoforms for prior-learning. This file can be either downloaded\
      \ from UCSC Genome Browser or generated by GEM (Derrien et al., 2012, PLoS One)."
    info: null
    must_exist: true
    create_parent: true
    # Declared optional here, although the description says pRSEM requires it.
    required: false
    direction: "input"
    multiple: false
    multiple_sep: ";"
# Script that implements the component.
resources:
- type: "bash_script"
  path: "script.sh"
  is_executable: true
description: "RSEM is a software package for estimating gene and isoform expression\
  \ levels from RNA-Seq data. This component prepares transcript references for RSEM.\n"
# Script used to test the component.
test_resources:
- type: "bash_script"
  path: "test.sh"
  is_executable: true
info: null
status: "enabled"
# Commands that must be available at run time.
requirements:
  commands:
  - "ps"
keywords:
- "Transcriptome"
- "Index"
license: "GPL-3.0"
references:
  doi:
  - "10.1186/1471-2105-12-323"
links:
  repository: "https://github.com/deweylab/RSEM"
  # https, matching the documentation link on the same host.
  homepage: "https://deweylab.github.io/RSEM"
  documentation: "https://deweylab.github.io/RSEM/rsem-prepare-reference.html"
# Runners this component is built for: a standalone executable and a Nextflow module.
runners:
- type: "executable"
  id: "executable"
  docker_setup_strategy: "ifneedbepullelsecachedbuild"
- type: "nextflow"
  id: "nextflow"
  directives:
    tag: "$id"
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    # Label name -> Nextflow process setting it expands to.
    labels:
      # Memory labels, decimal units (1 GB = 10^9 bytes).
      mem1gb: "memory = 1000000000.B"
      mem2gb: "memory = 2000000000.B"
      mem5gb: "memory = 5000000000.B"
      mem10gb: "memory = 10000000000.B"
      mem20gb: "memory = 20000000000.B"
      mem50gb: "memory = 50000000000.B"
      mem100gb: "memory = 100000000000.B"
      mem200gb: "memory = 200000000000.B"
      mem500gb: "memory = 500000000000.B"
      mem1tb: "memory = 1000000000000.B"
      mem2tb: "memory = 2000000000000.B"
      mem5tb: "memory = 5000000000000.B"
      mem10tb: "memory = 10000000000000.B"
      mem20tb: "memory = 20000000000000.B"
      mem50tb: "memory = 50000000000000.B"
      mem100tb: "memory = 100000000000000.B"
      mem200tb: "memory = 200000000000000.B"
      mem500tb: "memory = 500000000000000.B"
      # Memory labels, binary units (1 GiB = 2^30 bytes).
      mem1gib: "memory = 1073741824.B"
      mem2gib: "memory = 2147483648.B"
      mem4gib: "memory = 4294967296.B"
      mem8gib: "memory = 8589934592.B"
      mem16gib: "memory = 17179869184.B"
      mem32gib: "memory = 34359738368.B"
      mem64gib: "memory = 68719476736.B"
      mem128gib: "memory = 137438953472.B"
      mem256gib: "memory = 274877906944.B"
      mem512gib: "memory = 549755813888.B"
      mem1tib: "memory = 1099511627776.B"
      mem2tib: "memory = 2199023255552.B"
      mem4tib: "memory = 4398046511104.B"
      mem8tib: "memory = 8796093022208.B"
      mem16tib: "memory = 17592186044416.B"
      mem32tib: "memory = 35184372088832.B"
      mem64tib: "memory = 70368744177664.B"
      mem128tib: "memory = 140737488355328.B"
      mem256tib: "memory = 281474976710656.B"
      mem512tib: "memory = 562949953421312.B"
      # CPU-count labels.
      cpu1: "cpus = 1"
      cpu2: "cpus = 2"
      cpu5: "cpus = 5"
      cpu10: "cpus = 10"
      cpu20: "cpus = 20"
      cpu50: "cpus = 50"
      cpu100: "cpus = 100"
      cpu200: "cpus = 200"
      cpu500: "cpus = 500"
      cpu1000: "cpus = 1000"
  debug: false
  container: "docker"
# Execution engines: a Docker image built from ubuntu:22.04, plus native execution.
engines:
- type: "docker"
  id: "docker"
  image: "ubuntu:22.04"
  target_registry: "images.viash-hub.com"
  target_tag: "main"
  namespace_separator: "/"
  setup:
  # System packages: build toolchain, download/unpack tools, and runtimes.
  - type: "apt"
    packages:
    - "build-essential"
    - "gcc"
    - "g++"
    - "make"
    - "wget"
    # Listed explicitly so the downloads below can verify TLS certificates.
    - "ca-certificates"
    - "zlib1g-dev"
    # One package per entry (previously fused into a single "unzip xxd" item).
    - "unzip"
    - "xxd"
    - "perl"
    - "r-base"
    - "bowtie2"
    - "pip"
    - "git"
    interactive: false
  # NOTE(review): this installs the PyPI distribution named "bowtie"; the Bowtie
  # aligner binaries are installed separately from SourceForge in the next step.
  # Confirm this pip package is actually needed.
  - type: "python"
    user: false
    packages:
    - "bowtie"
    upgrade: true
  # Build and install STAR, RSEM, Bowtie and HISAT2 into /usr/local/bin, then
  # remove every download and build tree from /tmp.
  # - wget verifies TLS certificates (no --no-check-certificate), like the
  #   git clone in the same script.
  # - Cleanup also removes the RSEM source tree, the RSEM zip and the bowtie zip.
  # - The autoremove call no longer references ${PACKAGES}, which is not defined
  #   in this entry's env list.
  # NOTE(review): hisat2 is cloned without a pinned tag or commit, so the built
  # version depends on the state of the upstream default branch at build time.
  - type: "docker"
    run:
    - |
      ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
      cd /tmp && \
      wget https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \
      unzip ${STAR_VERSION}.zip && \
      cd STAR-${STAR_VERSION}/source && \
      make STARstatic CXXFLAGS_SIMD=-std=c++11 && \
      cp STAR /usr/local/bin && \
      cd /tmp && \
      wget https://github.com/deweylab/RSEM/archive/refs/tags/v${RSEM_VERSION}.zip && \
      unzip v${RSEM_VERSION}.zip && \
      cd RSEM-${RSEM_VERSION} && \
      make && \
      make install && \
      cd /tmp && \
      wget -O bowtie-${BOWTIE_VERSION}-linux-x86_64.zip https://sourceforge.net/projects/bowtie-bio/files/bowtie/${BOWTIE_VERSION}/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip/download && \
      unzip bowtie-${BOWTIE_VERSION}-linux-x86_64.zip && \
      cp bowtie-${BOWTIE_VERSION}-linux-x86_64/bowtie* /usr/local/bin && \
      cd /tmp && \
      git clone https://github.com/DaehwanKimLab/hisat2.git /tmp/hisat2 && \
      cd /tmp/hisat2 && \
      make && \
      cp -r hisat2* /usr/local/bin && \
      cd && \
      rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip \
        /tmp/RSEM-${RSEM_VERSION} /tmp/v${RSEM_VERSION}.zip \
        /tmp/bowtie-${BOWTIE_VERSION}-linux-x86_64 /tmp/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip \
        /tmp/hisat2 && \
      apt-get --purge autoremove -y && \
      apt-get clean
    env:
    - "STAR_VERSION=2.7.11b"
    - "RSEM_VERSION=1.3.3"
    - "BOWTIE_VERSION=1.3.1"
    - "TZ=Europe/Brussels"
  # Record the installed tool versions in /var/software_versions.txt.
  # The bowtie2 pattern is unanchored, so only its first match is kept to stop
  # other version-like strings in the output from spilling onto extra lines.
  - type: "docker"
    run:
    - |
      echo "RSEM: `rsem-calculate-expression --version | sed -e 's/Current version: RSEM v//g'`" > /var/software_versions.txt && \
      echo "STAR: `STAR --version`" >> /var/software_versions.txt && \
      echo "bowtie2: `bowtie2 --version | grep -oP '\d+\.\d+\.\d+' | head -n 1`" >> /var/software_versions.txt && \
      echo "bowtie: `bowtie --version | grep -oP 'bowtie-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \
      echo "HISAT2: `hisat2 --version | grep -oP 'hisat2-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt
  entrypoint: []
  cmd: null
- type: "native"
  id: "native"
# Provenance of this generated config: source config, build target, and git state.
build_info:
  config: "src/rsem/rsem_prepare_reference/config.vsh.yaml"
  runner: "executable"
  engine: "docker|native"
  output: "target/executable/rsem/rsem_prepare_reference"
  executable: "target/executable/rsem/rsem_prepare_reference/rsem_prepare_reference"
  viash_version: "0.9.0"
  git_commit: "add125261c6fa0ed7c9906fc85e7368d2072c4a3"
  # Credentials removed from the remote URL: an access token must never be
  # stored in a published artifact. The token that was recorded here should be
  # treated as compromised and revoked if it is still valid.
  git_remote: "https://github.com/viash-hub/biobox"
  git_tag: "v0.2.0-9-gadd1252"
# Package-level settings of the package this component belongs to.
package_config:
  name: "biobox"
  version: "main"
  description: "A collection of bioinformatics tools for working with sequence data.\n"
  info: null
  viash_version: "0.9.0"
  source: "src"
  target: "target"
  # Config modifications listed for the package; their values match what appears
  # above (requirements.commands, the native engine, docker target_registry/target_tag).
  config_mods:
  - ".requirements.commands := ['ps']\n"
  - ".engines += { type: \"native\" }"
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'main'"
  keywords:
  - "bioinformatics"
  - "modules"
  - "sequencing"
  license: "MIT"
  organization: "vsh"
  links:
    repository: "https://github.com/viash-hub/biobox"
    issue_tracker: "https://github.com/viash-hub/biobox/issues"