Build branch main with version main (1e1ffb3)

Build pipeline: vsh-ci-dev-jsbwk Source commit: 1e1ffb315f Source message: Merge pull request #17 from viash-hub/add_biobox_modules - Migrate a number of components to biobox - Fix tests - Reduce size of test resources - Prepare for Viash Hub
2024-09-13 07:41:13 +00:00
commit 1ebb61f1e8
557 changed files with 430700 additions and 0 deletions
--- a/target/executable/dupradar/.config.vsh.yaml
+++ b/target/executable/dupradar/.config.vsh.yaml
@@ -0,0 +1,277 @@
+# Component identity.
+name: 'dupradar'
+version: 'main'
+argument_groups:
+# Arguments whose direction is "input".
+- name: 'Input'
+  arguments:
+  # Optional sample identifier.
+  - type: 'string'
+    name: '--id'
+    description: 'Sample ID'
+    info: null
+    required: false
+    direction: 'input'
+    multiple: false
+    multiple_sep: ';'
+  # Required alignment file.
+  - type: 'file'
+    name: '--input'
+    description: 'path to input alignment file in BAM format'
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: 'input'
+    multiple: false
+    multiple_sep: ';'
+  # Required annotation file.
+  - type: 'file'
+    name: '--gtf_annotation'
+    description: 'path to GTF annotation file.'
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: 'input'
+    multiple: false
+    multiple_sep: ';'
+  # Optional paired-reads switch.
+  - type: 'boolean'
+    name: '--paired'
+    description: 'add flag if input alignment file consists of paired reads'
+    info: null
+    required: false
+    direction: 'input'
+    multiple: false
+    multiple_sep: ';'
+  # Optional strandedness, restricted to the three choices listed below.
+  - type: 'string'
+    name: '--strandedness'
+    description: >-
+      strandedness of input bam file reads (forward, reverse or unstranded
+      (default, applicable to paired reads))
+    info: null
+    required: false
+    choices:
+    - 'forward'
+    - 'reverse'
+    - 'unstranded'
+    direction: 'input'
+    multiple: false
+    multiple_sep: ';'
+# Output files. All seven arguments are optional and every default file name
+# starts with the literal placeholder "$id".
+- name: "Output"
+  arguments:
+  # Duplicate tag counts (txt).
+  - type: "file"
+    name: "--output_dupmatrix"
+    description: "path to output file (txt) of duplicate tag counts"
+    info: null
+    default:
+    - "$id.dup_matrix.txt"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  # MultiQC intercept value (txt).
+  - type: "file"
+    name: "--output_dup_intercept_mqc"
+    description: "path to output file (txt) of multiqc intercept value DupRadar"
+    info: null
+    default:
+    - "$id.dup_intercept_mqc.txt"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  # Expression box plot (pdf).
+  - type: "file"
+    name: "--output_duprate_exp_boxplot"
+    description: "path to output file (pdf) of distribution of expression box plot"
+    info: null
+    default:
+    - "$id.duprate_exp_boxplot.pdf"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  # 2D density scatter plot (pdf).
+  - type: "file"
+    name: "--output_duprate_exp_densplot"
+    description: "path to output file (pdf) of 2D density scatter plot of duplicate\
+      \ tag counts"
+    info: null
+    default:
+    - "$id.duprate_exp_densityplot.pdf"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  # Density curve data for MultiQC. The default below is a .txt file, so the
+  # description names txt rather than pdf.
+  - type: "file"
+    name: "--output_duprate_exp_denscurve_mqc"
+    description: "path to output file (txt) of density curve of gene duplication multiqc"
+    info: null
+    default:
+    - "$id.duprate_exp_density_curve_mqc.txt"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  # Histogram of RPK values per gene (pdf).
+  - type: "file"
+    name: "--output_expression_histogram"
+    description: "path to output file (pdf) of distribution of RPK values per gene\
+      \ histogram"
+    info: null
+    default:
+    - "$id.expression_hist.pdf"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+  # Intercept and slope summary (txt).
+  - type: "file"
+    name: "--output_intercept_slope"
+    description: "output file (txt) with progression of duplication rate value"
+    info: null
+    default:
+    - "$id.intercept_slope.txt"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+# Files shipped with the component: the bash entry script and the R script.
+resources:
+- type: 'bash_script'
+  path: 'script.sh'
+  is_executable: true
+- type: 'file'
+  path: 'dupradar.r'
+description: |
+  Assessment of duplication rates in RNA-Seq datasets
+# Test script plus the BAM and GTF files listed alongside it.
+test_resources:
+- type: 'bash_script'
+  path: 'test.sh'
+  is_executable: true
+- type: 'file'
+  path: 'wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam'
+- type: 'file'
+  path: 'genes.gtf'
+# Migration metadata: upstream repository, module path and last commit sha.
+info:
+  migration_info:
+    git_repo: 'https://github.com/nf-core/rnaseq.git'
+    paths:
+    - 'modules/local/dupradar.nf'
+    last_sha: '54721c6946daf6d602d7069dc127deef9cbe6b33'
+status: 'enabled'
+# Commands listed as requirements of the component.
+requirements:
+  commands:
+  - 'ps'
+runners:
+- type: 'executable'
+  id: 'executable'
+  docker_setup_strategy: 'ifneedbepullelsecachedbuild'
+- type: 'nextflow'
+  id: 'nextflow'
+  directives:
+    tag: '$id'
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    # Label name -> directive string. Memory values are expressed in bytes:
+    # decimal sizes (gb/tb) first, then binary sizes (gib/tib), then cpu counts.
+    labels:
+      mem1gb: 'memory = 1000000000.B'
+      mem2gb: 'memory = 2000000000.B'
+      mem5gb: 'memory = 5000000000.B'
+      mem10gb: 'memory = 10000000000.B'
+      mem20gb: 'memory = 20000000000.B'
+      mem50gb: 'memory = 50000000000.B'
+      mem100gb: 'memory = 100000000000.B'
+      mem200gb: 'memory = 200000000000.B'
+      mem500gb: 'memory = 500000000000.B'
+      mem1tb: 'memory = 1000000000000.B'
+      mem2tb: 'memory = 2000000000000.B'
+      mem5tb: 'memory = 5000000000000.B'
+      mem10tb: 'memory = 10000000000000.B'
+      mem20tb: 'memory = 20000000000000.B'
+      mem50tb: 'memory = 50000000000000.B'
+      mem100tb: 'memory = 100000000000000.B'
+      mem200tb: 'memory = 200000000000000.B'
+      mem500tb: 'memory = 500000000000000.B'
+      mem1gib: 'memory = 1073741824.B'
+      mem2gib: 'memory = 2147483648.B'
+      mem4gib: 'memory = 4294967296.B'
+      mem8gib: 'memory = 8589934592.B'
+      mem16gib: 'memory = 17179869184.B'
+      mem32gib: 'memory = 34359738368.B'
+      mem64gib: 'memory = 68719476736.B'
+      mem128gib: 'memory = 137438953472.B'
+      mem256gib: 'memory = 274877906944.B'
+      mem512gib: 'memory = 549755813888.B'
+      mem1tib: 'memory = 1099511627776.B'
+      mem2tib: 'memory = 2199023255552.B'
+      mem4tib: 'memory = 4398046511104.B'
+      mem8tib: 'memory = 8796093022208.B'
+      mem16tib: 'memory = 17592186044416.B'
+      mem32tib: 'memory = 35184372088832.B'
+      mem64tib: 'memory = 70368744177664.B'
+      mem128tib: 'memory = 140737488355328.B'
+      mem256tib: 'memory = 281474976710656.B'
+      mem512tib: 'memory = 562949953421312.B'
+      cpu1: 'cpus = 1'
+      cpu2: 'cpus = 2'
+      cpu5: 'cpus = 5'
+      cpu10: 'cpus = 10'
+      cpu20: 'cpus = 20'
+      cpu50: 'cpus = 50'
+      cpu100: 'cpus = 100'
+      cpu200: 'cpus = 200'
+      cpu500: 'cpus = 500'
+      cpu1000: 'cpus = 1000'
+  debug: false
+  container: 'docker'
+# Execution engines: a docker image built on ubuntu:22.04, and a native engine.
+engines:
+- type: 'docker'
+  id: 'docker'
+  image: 'ubuntu:22.04'
+  target_registry: 'images.viash-hub.com'
+  target_tag: 'main'
+  namespace_separator: '/'
+  # Setup steps: the r-base apt package first, then the dupRadar bioc package.
+  setup:
+  - type: 'apt'
+    packages:
+    - 'r-base'
+    interactive: false
+  - type: 'r'
+    bioc:
+    - 'dupRadar'
+    bioc_force_install: false
+  entrypoint: []
+  cmd: null
+- type: 'native'
+  id: 'native'
+# Provenance of this build: source config, output location, tool version, commit.
+build_info:
+  config: 'src/dupradar/config.vsh.yaml'
+  runner: 'executable'
+  engine: 'docker|native'
+  output: 'target/executable/dupradar'
+  executable: 'target/executable/dupradar/dupradar'
+  viash_version: '0.9.0'
+  git_commit: '1e1ffb315fefec05db2ee0c62e1c98ce4b49929c'
+  git_remote: 'https://github.com/viash-hub/rnaseq'
+# Package-level settings recorded alongside the component.
+package_config:
+  version: 'main'
+  info:
+    test_resources:
+    - path: 'gs://viash-hub-test-data/rnaseq/v1'
+      dest: 'testData'
+  viash_version: '0.9.0'
+  source: 'src'
+  target: 'target'
+  # Config modification expressions; the first entry holds two statements, each
+  # terminated by a newline.
+  config_mods:
+  - |
+    .requirements.commands := ['ps']
+    .runners[.type == 'nextflow'].directives.tag := '$id'
+  - '.engines += { type: "native" }'
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'main'"
+  organization: 'vsh'
--- a/target/executable/dupradar/dupradar
+++ b/target/executable/dupradar/dupradar
--- a/target/executable/dupradar/dupradar.r
+++ b/target/executable/dupradar/dupradar.r
@@ -0,0 +1,154 @@
+#!/usr/bin/env Rscript
+
+# Command line argument processing
+#
+# Positional arguments (see the usage string below):
+#   1. input BAM file (must exist and end in ".bam")
+#   2. sample id, used as the prefix of every output file name
+#   3. GTF annotation file (must exist)
+#   4. strand direction: 0 = unstranded, 1 = forward, 2 = reverse
+#   5. "true" for paired-end reads; any other value means single-end
+#   6. number of threads (strictly positive number)
+#   7. R package library location (optional)
+args = commandArgs(trailingOnly=TRUE)
+# Six arguments are mandatory: the thread count (argument 6) is validated
+# below, so a call with only five arguments must fail here with the usage text
+# rather than later with an NA-related message.
+if (length(args) < 6) {
+    stop("Usage: dupRadar.r <input.bam> <sample_id> <annotation.gtf> <strandDirection:0=unstranded/1=forward/2=reverse> <paired/single> <nbThreads> <R-package-location (optional)>", call.=FALSE)
+}
+
+message("paired_end is", args[5])
+message("the type is is", class(args[5]))
+
+input_bam <- args[1]
+output_prefix <- args[2]
+annotation_gtf <- args[3]
+stranded <- as.numeric(args[4])
+# Only the exact string "true" enables paired-end mode.
+paired_end <- args[5] == "true"
+threads <- as.numeric(args[6])
+
+bamRegex <- "(.+)\\.bam$"
+
+# Validate the inputs; each failed check aborts the script.
+if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) &&  (!file.info(input_bam)$isdir))) stop("First argument '<input.bam>' must be an existing file (not a directory) with '.bam' extension...")
+if(!(file.exists(annotation_gtf) &&  (!file.info(annotation_gtf)$isdir))) stop("Third argument '<annotation.gtf>' must be an existing file (and not a directory)...")
+if(is.na(stranded) || (!(stranded %in% (0:2)))) stop("Fourth argument <strandDirection> must be a numeric value in 0(unstranded)/1(forward)/2(reverse)...")
+# nbThreads is read from args[6], so the message refers to the sixth argument.
+if(is.na(threads) || (threads<=0)) stop("Sixth argument <nbThreads> must be a strictly positive numeric value...")
+
+# Debug messages (stderr)
+message("Input bam      (Arg 1): ", input_bam)
+message("Output basename(Arg 2): ", output_prefix)
+message("Input gtf      (Arg 3): ", annotation_gtf)
+# stranded is 0-based, R vectors are 1-based, hence the +1.
+message("Strandness     (Arg 4): ", c("unstranded", "forward", "reverse")[stranded+1])
+message("paired_end     (Arg 5): ", paired_end)
+message("Nb threads     (Arg 6): ", threads)
+# The package location is the optional seventh argument.
+message("R package loc. (Arg 7): ", ifelse(length(args) > 6, args[7], "Not specified"))
+
+
+# Load / install packages
+# The optional library location is the seventh command line argument. The
+# sixth argument is the thread count and must not be put on the library path.
+if (length(args) > 6) { .libPaths( c( args[7], .libPaths() ) ) }
+# NOTE(review): biocLite is the legacy Bioconductor installer; current
+# Bioconductor documentation describes BiocManager::install() instead. Confirm
+# that this fallback still works on the targeted R version before relying on it.
+if (!require("dupRadar")){
+    source("http://bioconductor.org/biocLite.R")
+    biocLite("dupRadar", suppressUpdates=TRUE)
+    library("dupRadar")
+}
+# Same load-or-install pattern for the parallel package.
+if (!require("parallel")) {
+    install.packages("parallel", dependencies=TRUE, repos='http://cloud.r-project.org/')
+    library("parallel")
+}
+
+# Compute the duplication matrix and store it as a tab-separated table
+dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads)
+dup_matrix_file <- paste0(output_prefix, "_dupMatrix.txt")
+write.table(dm, file = dup_matrix_file, quote = FALSE, row.names = FALSE, sep = "\t")
+
+# Density scatter plot, written to its own pdf and labelled with the prefix
+density_pdf <- paste0(output_prefix, "_duprateExpDens.pdf")
+pdf(file = density_pdf)
+duprateExpDensPlot(DupMat = dm)
+title(main = "Density scatter plot")
+mtext(text = output_prefix, side = 3)
+dev.off()
+
+# Fit the duplication rate model and report its intercept and slope
+fit <- duprateExpFit(DupMat = dm)
+intercept_text <- paste("- dupRadar Int (duprate at low read counts):", fit$intercept)
+slope_text <- paste("- dupRadar Sl (progression of the duplication rate):", fit$slope)
+cat(
+    intercept_text,
+    slope_text,
+    file = paste0(output_prefix, "_intercept_slope.txt"),
+    fill = TRUE, labels = output_prefix, append = FALSE
+)
+
+# Create a multiqc file dupInt
+# The file consists of a commented header followed by one
+# "<sample name> <intercept>" row.
+sample_name <- gsub("Aligned.sortedByCoord.out.markDups", "", output_prefix)
+line="#id: DupInt
+#plot_type: 'generalstats'
+#pconfig:
+#    dupRadar_intercept:
+#        title: 'dupInt'
+#        namespace: 'DupRadar'
+#        description: 'Intercept value from DupRadar'
+#        max: 100
+#        min: 0
+#        scale: 'RdYlGn-rev'
+#        format: '{:.2f}%'
+Sample dupRadar_intercept"
+
+dup_intercept_mqc_file <- paste0(output_prefix, "_dup_intercept_mqc.txt")
+# The header write truncates the file so that a leftover file from an earlier
+# run does not end up with two headers; only the data row is appended.
+write(line,file=dup_intercept_mqc_file,append=FALSE)
+write(paste(sample_name, fit$intercept),file=dup_intercept_mqc_file,append=TRUE)
+
+# Get numbers from dupRadar GLM
+# x: sorted log10 of the RPK column; y: model prediction scaled to percent.
+curve_x <- sort(log10(dm$RPK))
+curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response")
+# Remove all of the infinite values
+# A logical mask is used on purpose: indexing with -which(...) returns an empty
+# vector when nothing matches, which would discard every point whenever the
+# data contains no infinite value.
+finite_points = is.finite(curve_x)
+curve_x = curve_x[finite_points]
+curve_y = curve_y[finite_points]
+# Reduce number of data points
+# Keeps positions 1, 11, 21, ...; filtering seq_along() also works when no
+# point is left, where seq(1, 0, 10) would raise an error.
+every_tenth = seq_along(curve_x) %% 10 == 1
+curve_x <- curve_x[every_tenth]
+curve_y <- curve_y[every_tenth]
+# Convert x values back to real counts
+curve_x = 10^curve_x
+# Write to file
+line="#id: dupradar
+#section_name: 'DupRadar'
+#section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html'
+#description: \"provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication.
+#    This plot shows the general linear models - a summary of the gene duplication distributions. \"
+#pconfig:
+#    title: 'DupRadar General Linear Model'
+#    xLog: True
+#    xlab: 'expression (reads/kbp)'
+#    ylab: '% duplicate reads'
+#    ymax: 100
+#    ymin: 0
+#    tt_label: '<b>{point.x:.1f} reads/kbp</b>: {point.y:,.2f}% duplicates'
+#    xPlotLines:
+#        - color: 'green'
+#          dashStyle: 'LongDash'
+#          label:
+#                style: {color: 'green'}
+#                text: '0.5 RPKM'
+#                verticalAlign: 'bottom'
+#                y: -65
+#          value: 0.5
+#          width: 1
+#        - color: 'red'
+#          dashStyle: 'LongDash'
+#          label:
+#                style: {color: 'red'}
+#                text: '1 read/bp'
+#                verticalAlign: 'bottom'
+#                y: -65
+#          value: 1000
+#          width: 1"
+
+dens_curve_mqc_file <- paste0(output_prefix, "_duprateExpDensCurve_mqc.txt")
+# The header write truncates the file so that a leftover file from an earlier
+# run is not extended; the data rows are then appended below the header.
+write(line,file=dens_curve_mqc_file,append=FALSE)
+write.table(
+    cbind(curve_x, curve_y),
+    file=dens_curve_mqc_file,
+    quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE
+)
+
+# Box plot of percent duplication by expression, one pdf per sample prefix
+boxplot_pdf <- paste0(output_prefix, "_duprateExpBoxplot.pdf")
+pdf(file = boxplot_pdf)
+duprateExpBoxplot(DupMat = dm)
+title(main = "Percent Duplication by Expression")
+mtext(text = output_prefix, side = 3)
+dev.off()
+
+# Histogram of the RPK values per gene
+histogram_pdf <- paste0(output_prefix, "_expressionHist.pdf")
+pdf(file = histogram_pdf)
+expressionHist(DupMat = dm)
+title(main = "Distribution of RPK values per gene")
+mtext(text = output_prefix, side = 3)
+dev.off()
+
+# Report the prefix, the dupRadar citation and the session details on stdout
+print(output_prefix)
+citation(package = "dupRadar")
+sessionInfo()