# biobox/target/executable/rsem/rsem_prepare_reference/.config.vsh.yaml

# Identity of the component: name, namespace and version.
name: rsem_prepare_reference
namespace: rsem
version: main
# Authors of the component, with their roles and contact details.
authors:
- name: Sai Nirmayi Yasa
  roles:
  - author
  - maintainer
  info:
    links:
      email: nirmayi@data-intuitive.com
      github: sainirmayi
      linkedin: sai-nirmayi-yasa
    organizations:
    - name: Data Intuitive
      href: 'https://www.data-intuitive.com'
      role: Junior Bioinformatics Researcher
# Arguments accepted by the component, organised into named groups.
argument_groups:
# Required inputs: the reference sequences and the name/prefix of the reference.
- name: Inputs
  arguments:
  # One or more FASTA files (or a directory), separated by ';'.
  - type: file
    name: '--reference_fasta_files'
    description: >
      Semi-colon separated list of Multi-FASTA formatted files OR a directory
      name. If a directory name is specified, RSEM will read all files with
      suffix ".fa" or ".fasta" in this directory. The files should contain
      either the sequences of transcripts or an entire genome, depending on
      whether the '--gtf' option is used.
    info: null
    example:
    - read1.fasta
    must_exist: true
    create_parent: true
    required: true
    direction: input
    multiple: true
    multiple_sep: ';'
  # Prefix (optionally including a path) for the generated reference files.
  - type: string
    name: '--reference_name'
    description: >
      The name of the reference used. RSEM will generate several
      reference-related files that are prefixed by this name. This name can
      contain path information (e.g. '/ref/mm9').
    info: null
    example:
    - '/ref/mm9'
    required: true
    direction: input
    multiple: false
    multiple_sep: ';'
# Output group: a single required output directory.
- name: Outputs
  arguments:
  - type: file
    name: '--output'
    description: Directory containing reference files generated by RSEM.
    info: null
    must_exist: true
    create_parent: true
    required: true
    direction: output
    multiple: false
    multiple_sep: ';'
# Optional arguments: annotation inputs, transcript filtering, gene maps,
# poly(A) handling, aligner index selection and logging.
- name: Other options
  arguments:
  # Annotation inputs (GTF or GFF3) and GFF3 conversion controls.
  - type: file
    name: '--gtf'
    description: >-
      Assume that 'reference_fasta_files' contains the sequence of a genome,
      and extract transcript reference sequences using the gene annotations
      specified in the GTF file. If this and '--gff3' options are not provided,
      RSEM will assume 'reference_fasta_files' contains the reference
      transcripts. In this case, RSEM assumes that name of each sequence in the
      Multi-FASTA files is its transcript_id.
    info: null
    example:
    - annotations.gtf
    must_exist: true
    create_parent: true
    required: false
    direction: input
    multiple: false
    multiple_sep: ';'
  - type: file
    name: '--gff3'
    description: >-
      GFF3 annotation file. Converted to GTF format with the file name
      'reference_name.gtf'. Please make sure that 'reference_name.gtf' does not
      exist.
    info: null
    example:
    - annotations.gff
    must_exist: true
    create_parent: true
    required: false
    direction: input
    multiple: false
    multiple_sep: ';'
  - type: string
    name: '--gff3_rna_patterns'
    description: >-
      List of transcript categories (separated by semi-colon). Only
      transcripts that match the string will be extracted.
    info: null
    example:
    - 'mRNA;rRNA'
    required: false
    direction: input
    multiple: true
    multiple_sep: ';'
  - type: boolean_true
    name: '--gff3_genes_as_transcripts'
    description: >-
      This option is designed for untypical organisms, such as viruses, whose
      GFF3 files only contain genes. RSEM will assume each gene as a unique
      transcript when it converts the GFF3 file into GTF format.
    info: null
    direction: input
  - type: string
    name: '--trusted_sources'
    description: >-
      List of trusted sources (separated by semi-colon). Only transcripts
      coming from these sources will be extracted. If this option is off, all
      sources are accepted.
    info: null
    example:
    - 'ENSEMBL;HAVANA'
    required: false
    direction: input
    multiple: true
    multiple_sep: ';'
  # Mapping files. The description below is kept as an escaped double-quoted
  # scalar because it contains significant trailing spaces before newlines.
  - type: file
    name: '--transcript_to_gene_map'
    description: "Use information from this file to map from transcript (isoform)\
      \ ids to gene ids. Each line of this file should be of the form: \n  gene_id\
      \ transcript_id\nwith the two fields separated by a tab character.\nIf you are\
      \ using a GTF file for the \"UCSC Genes\" gene set from the UCSC Genome Browser,\
      \ then the \"knownIsoforms.txt\" file (obtained from the \"Downloads\" section\
      \ of the UCSC Genome Browser site) is of this format. \nIf this option is off,\
      \ then the mapping of isoforms to genes depends on whether the '--gtf' option\
      \ is specified. If '--gtf' is specified, then RSEM uses the \"gene_id\" and\
      \ \"transcript_id\" attributes in the GTF file. Otherwise, RSEM assumes that\
      \ each sequence in the reference sequence files is a separate gene.\n"
    info: null
    example:
    - isoforms.txt
    must_exist: true
    create_parent: true
    required: false
    direction: input
    multiple: false
    multiple_sep: ';'
  # Literal block scalar: line breaks and the indented example line are kept.
  - type: file
    name: '--allele_to_gene_map'
    description: |
      Use information from <file> to provide gene_id and transcript_id information for each allele-specific transcript. Each line of <file> should be of the form:
        gene_id transcript_id allele_id
      with the fields separated by a tab character.
      This option is designed for quantifying allele-specific expression. It is only valid if '--gtf' option is not specified. allele_id should be the sequence names presented in the Multi-FASTA-formatted files.
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: input
    multiple: false
    multiple_sep: ';'
  # Poly(A) tail handling. The help text names the length option exactly as it
  # is declared in this file ('--polyA_length').
  - type: boolean_true
    name: '--polyA'
    description: >-
      Add poly(A) tails to the end of all reference isoforms. The length of
      poly(A) tail added is specified by '--polyA_length' option. STAR aligner
      users may not want to use this option.
    info: null
    direction: input
  - type: integer
    name: '--polyA_length'
    description: The length of the poly(A) tails to be added.
    info: null
    example:
    - 125
    required: false
    direction: input
    multiple: false
    multiple_sep: ';'
  - type: file
    name: '--no_polyA_subset'
    description: >-
      Only meaningful if '--polyA' is specified. Do not add poly(A) tails to
      those transcripts listed in this file containing a list of
      transcript_ids.
    info: null
    example:
    - transcript_ids.txt
    must_exist: true
    create_parent: true
    required: false
    direction: input
    multiple: false
    multiple_sep: ';'
  # Flags selecting which aligner indices to build.
  - type: boolean_true
    name: '--bowtie'
    description: Build Bowtie indices.
    info: null
    direction: input
  - type: boolean_true
    name: '--bowtie2'
    description: Build Bowtie 2 indices.
    info: null
    direction: input
  - type: boolean_true
    name: '--star'
    description: Build STAR indices.
    info: null
    direction: input
  - type: integer
    name: '--star_sjdboverhang'
    description: >-
      Length of the genomic sequence around annotated junction. It is only used
      for STAR to build splice junctions database and not needed for Bowtie or
      Bowtie2. It will be passed as the --sjdbOverhang option to STAR.
      According to STAR's manual, its ideal value is max(ReadLength)-1, e.g.
      for 2x101 paired-end reads, the ideal value is 101-1=100. In most cases,
      the default value of 100 will work as well as the ideal value. (Default
      is 100)
    info: null
    example:
    - 100
    required: false
    direction: input
    multiple: false
    multiple_sep: ';'
  - type: boolean_true
    name: '--hisat2_hca'
    description: >-
      Build HISAT2 indices on the transcriptome according to Human Cell Atlas
      (HCA) SMART-Seq2 pipeline.
    info: null
    direction: input
  # Logging control; '-q' is the short form.
  - type: boolean_true
    name: '--quiet'
    alternatives:
    - '-q'
    description: Suppress the output of logging information.
    info: null
    direction: input
# Arguments specific to prior-enhanced RSEM (pRSEM).
- name: Prior-enhanced RSEM options
  arguments:
  - type: boolean_true
    name: '--prep_pRSEM'
    description: >-
      A Boolean indicating whether to prepare reference files for pRSEM,
      including building Bowtie indices for a genome and selecting training set
      isoforms. The index files will be used for aligning ChIP-seq reads in
      prior-enhanced RSEM and the training set isoforms will be used for
      learning prior. A path to Bowtie executables and a mappability file in
      bigWig format are required when this option is on. Currently, Bowtie2 is
      not supported for prior-enhanced RSEM.
    info: null
    direction: input
  - type: file
    name: '--mappability_bigwig_file'
    description: >-
      Full path to a whole-genome mappability file in bigWig format. This file
      is required for running prior-enhanced RSEM. It is used for selecting a
      training set of isoforms for prior-learning. This file can be either
      downloaded from UCSC Genome Browser or generated by GEM (Derrien et al.,
      2012, PLoS One).
    info: null
    must_exist: true
    create_parent: true
    required: false
    direction: input
    multiple: false
    multiple_sep: ';'
# Script resource shipped with the component.
resources:
- type: bash_script
  path: script.sh
  is_executable: true
# Summary of the component (folded scalar; ends with a single newline).
description: >
  RSEM is a software package for estimating gene and isoform expression levels
  from RNA-Seq data. This component prepares transcript references for RSEM.
# Script resource listed for testing the component.
test_resources:
- type: bash_script
  path: test.sh
  is_executable: true
info: null
status: enabled
# Commands the component requires to be available.
requirements:
  commands:
  - ps
keywords:
- Transcriptome
- Index
license: GPL-3.0
# Publication describing RSEM.
references:
  doi:
  - '10.1186/1471-2105-12-323'
# Upstream project links; all use HTTPS.
links:
  repository: 'https://github.com/deweylab/RSEM'
  homepage: 'https://deweylab.github.io/RSEM'
  documentation: 'https://deweylab.github.io/RSEM/rsem-prepare-reference.html'
# Runners: a standalone executable and a Nextflow module.
runners:
- type: executable
  id: executable
  docker_setup_strategy: ifneedbepullelsecachedbuild
- type: nextflow
  id: nextflow
  directives:
    tag: '$id'
  auto:
    simplifyInput: true
    simplifyOutput: false
    transcript: false
    publish: false
  config:
    # Named resource labels mapping to Nextflow memory / cpus settings.
    labels:
      # Memory, decimal units (values in bytes).
      mem1gb: 'memory = 1000000000.B'
      mem2gb: 'memory = 2000000000.B'
      mem5gb: 'memory = 5000000000.B'
      mem10gb: 'memory = 10000000000.B'
      mem20gb: 'memory = 20000000000.B'
      mem50gb: 'memory = 50000000000.B'
      mem100gb: 'memory = 100000000000.B'
      mem200gb: 'memory = 200000000000.B'
      mem500gb: 'memory = 500000000000.B'
      mem1tb: 'memory = 1000000000000.B'
      mem2tb: 'memory = 2000000000000.B'
      mem5tb: 'memory = 5000000000000.B'
      mem10tb: 'memory = 10000000000000.B'
      mem20tb: 'memory = 20000000000000.B'
      mem50tb: 'memory = 50000000000000.B'
      mem100tb: 'memory = 100000000000000.B'
      mem200tb: 'memory = 200000000000000.B'
      mem500tb: 'memory = 500000000000000.B'
      # Memory, binary units (values in bytes).
      mem1gib: 'memory = 1073741824.B'
      mem2gib: 'memory = 2147483648.B'
      mem4gib: 'memory = 4294967296.B'
      mem8gib: 'memory = 8589934592.B'
      mem16gib: 'memory = 17179869184.B'
      mem32gib: 'memory = 34359738368.B'
      mem64gib: 'memory = 68719476736.B'
      mem128gib: 'memory = 137438953472.B'
      mem256gib: 'memory = 274877906944.B'
      mem512gib: 'memory = 549755813888.B'
      mem1tib: 'memory = 1099511627776.B'
      mem2tib: 'memory = 2199023255552.B'
      mem4tib: 'memory = 4398046511104.B'
      mem8tib: 'memory = 8796093022208.B'
      mem16tib: 'memory = 17592186044416.B'
      mem32tib: 'memory = 35184372088832.B'
      mem64tib: 'memory = 70368744177664.B'
      mem128tib: 'memory = 140737488355328.B'
      mem256tib: 'memory = 281474976710656.B'
      mem512tib: 'memory = 562949953421312.B'
      # CPU counts.
      cpu1: 'cpus = 1'
      cpu2: 'cpus = 2'
      cpu5: 'cpus = 5'
      cpu10: 'cpus = 10'
      cpu20: 'cpus = 20'
      cpu50: 'cpus = 50'
      cpu100: 'cpus = 100'
      cpu200: 'cpus = 200'
      cpu500: 'cpus = 500'
      cpu1000: 'cpus = 1000'
  debug: false
  container: docker
# Execution engines: a Docker image assembled from the setup steps below, and
# a native engine.
engines:
- type: docker
  id: docker
  image: 'ubuntu:22.04'
  target_registry: images.viash-hub.com
  target_tag: main
  namespace_separator: '/'
  setup:
  # System packages; one package name per list entry.
  - type: apt
    packages:
    - build-essential
    - gcc
    - g++
    - make
    - wget
    - zlib1g-dev
    - unzip
    - xxd
    - perl
    - r-base
    - bowtie2
    - pip
    - git
    interactive: false
  # NOTE(review): the PyPI distribution named "bowtie" may be unrelated to the
  # Bowtie aligner (which the next step installs from SourceForge) - confirm
  # whether this step is needed.
  - type: python
    user: false
    packages:
    - bowtie
    upgrade: true
  # Sets the timezone, then downloads and installs STAR, RSEM, Bowtie and
  # HISAT2 under /usr/local/bin (RSEM via `make install`). The final `rm -rf`
  # removes every archive and source tree created in /tmp by this step.
  # NOTE(review): `wget --no-check-certificate` disables TLS verification for
  # the downloads; consider installing ca-certificates and dropping the flag.
  - type: docker
    run:
    - |
      ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
      cd /tmp && \
      wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \
      unzip ${STAR_VERSION}.zip && \
      cd STAR-${STAR_VERSION}/source && \
      make STARstatic CXXFLAGS_SIMD=-std=c++11 && \
      cp STAR /usr/local/bin && \
      cd /tmp && \
      wget --no-check-certificate https://github.com/deweylab/RSEM/archive/refs/tags/v${RSEM_VERSION}.zip && \
      unzip v${RSEM_VERSION}.zip && \
      cd RSEM-${RSEM_VERSION} && \
      make && \
      make install && \
      cd /tmp && \
      wget --no-check-certificate -O bowtie-${BOWTIE_VERSION}-linux-x86_64.zip https://sourceforge.net/projects/bowtie-bio/files/bowtie/${BOWTIE_VERSION}/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip/download && \
      unzip bowtie-${BOWTIE_VERSION}-linux-x86_64.zip && \
      cp bowtie-${BOWTIE_VERSION}-linux-x86_64/bowtie* /usr/local/bin && \
      cd /tmp && \
      git clone https://github.com/DaehwanKimLab/hisat2.git /tmp/hisat2 && \
      cd /tmp/hisat2 && \
      make && \
      cp -r hisat2* /usr/local/bin && \
      cd && \
      rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip /tmp/RSEM-${RSEM_VERSION} /tmp/v${RSEM_VERSION}.zip /tmp/bowtie-${BOWTIE_VERSION}-linux-x86_64 /tmp/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip /tmp/hisat2 && \
      apt-get --purge autoremove -y && \
      apt-get clean
    # Tool versions and timezone referenced by the commands above.
    env:
    - 'STAR_VERSION=2.7.11b'
    - 'RSEM_VERSION=1.3.3'
    - 'BOWTIE_VERSION=1.3.1'
    - 'TZ=Europe/Brussels'
  # Writes the versions of the installed tools to /var/software_versions.txt.
  - type: docker
    run:
    - |
      echo "RSEM: `rsem-calculate-expression --version | sed -e 's/Current version: RSEM v//g'`" > /var/software_versions.txt && \
      echo "STAR: `STAR --version`" >> /var/software_versions.txt && \
      echo "bowtie2: `bowtie2 --version | grep -oP '\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \
      echo "bowtie: `bowtie --version | grep -oP 'bowtie-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \
      echo "HISAT2: `hisat2 --version | grep -oP 'hisat2-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt
  entrypoint: []
  cmd: null
- type: native
  id: native
# Build provenance: the source config, the runner/engine used, output paths and
# the git state of the build.
build_info:
  config: "src/rsem/rsem_prepare_reference/config.vsh.yaml"
  runner: "executable"
  engine: "docker|native"
  output: "target/executable/rsem/rsem_prepare_reference"
  executable: "target/executable/rsem/rsem_prepare_reference/rsem_prepare_reference"
  viash_version: "0.9.0"
  git_commit: "add125261c6fa0ed7c9906fc85e7368d2072c4a3"
  # Remote URL without embedded credentials; access tokens must never be
  # stored in this file.
  git_remote: "https://github.com/viash-hub/biobox"
  git_tag: "v0.2.0-9-gadd1252"
# Package-level settings of the biobox collection this component belongs to.
package_config:
  name: biobox
  version: main
  description: >
    A collection of bioinformatics tools for working with sequence data.
  info: null
  viash_version: '0.9.0'
  source: src
  target: target
  # Config modifications listed for the package.
  config_mods:
  - ".requirements.commands := ['ps']\n"
  - '.engines += { type: "native" }'
  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
  - ".engines[.type == 'docker'].target_tag := 'main'"
  keywords:
  - bioinformatics
  - modules
  - sequencing
  license: MIT
  organization: vsh
  links:
    repository: 'https://github.com/viash-hub/biobox'
    issue_tracker: 'https://github.com/viash-hub/biobox/issues'