Build branch fix_20 with version fix_20 (7ae6be6)

Build pipeline: viash-hub.htrnaseq.fix-20-jv9qq Source commit: 7ae6be67bd Source message: Update CHANGELOG.md
2024-12-18 13:19:28 +00:00
commit 1d3c6a62e0
183 changed files with 92019 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,18 @@
+target
+testData
+
+# Nextflow related files
+.nextflow
+.nextflow.log*
+work
+
+# Python related files
+*__pycache__*
+.venv
+
+# R related files
+.Rproj.user
+htrnaseq.Rproj
+
+# vscode
+.vscode
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -0,0 +1,15 @@
+# demultiplex v0.x.x
+
+# Minor changes
+
+* Use `v0.2.0` version of cutadapt instead of `main` (PR #23).
+
+# demultiplex v0.2.0
+
+# New functionality
+
+* Make sure that the Well ID matches the required format (PR #22 and PR #21). 
+
+# demultiplex v0.1.0
+
+Initial release
--- a/README.md
+++ b/README.md
@@ -0,0 +1,129 @@
+# HT-RNAseq - A pipeline for processing high-throughput RNA-seq data
+
+## Introduction
+__TODO__: Add a description of the pipeline here.
+
+## Test data
+
+As test data, we use [a DRUGseq dataset](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176150) from the [NCBI Sequence Read Archive](https://www.ncbi.nlm.nih.gov/sra).
+
+The original data has been (partly) subsampled to reduce the test runtime. We used [seqtk](https://github.com/lh3/seqtk) for this with a seed of 1, e.g.:
+
+```bash
+seqtk sample -s1 orig/SRR14730302/VH02001614_S8_R1_001.fastq.gz 10000 > 10k/SRR14730302/VH02001614_S8_R1_001.fastq.gz
+```
+
+The data is available at: `gs://viash-hub-test-data/htrnaseq/v1/`:
+
+```
+❯ gcstree -f viash-hub-test-data/htrnaseq/v1/
+viash-hub-test-data
+└── htrnaseq
+    └── v1
+        ├── [  48]  2-wells.fasta
+        ├── [465.3K]  GSE176150_metadata.csv
+        ├── 100k
+        │   ├── SRR14730301
+        │   │   ├── [8.5M]  VH02001612_S9_R1_001.fastq
+        │   │   └── [14.9M]  VH02001612_S9_R2_001.fastq
+        │   └── SRR14730302
+        │       ├── [8.5M]  VH02001614_S8_R1_001.fastq.gz
+        │       └── [14.9M]  VH02001614_S8_R2_001.fastq.gz
+        ├── 10k
+        │   ├── SRR14730301
+        │   │   ├── [845.4K]  VH02001612_S9_R1_001.fastq
+        │   │   └── [1.5M]  VH02001612_S9_R2_001.fastq
+        │   └── SRR14730302
+        │       ├── [845.3K]  VH02001614_S8_R1_001.fastq.gz
+        │       └── [1.5M]  VH02001614_S8_R2_001.fastq.gz
+        └── orig
+            ├── [20.4G]  SRR14730301
+            │   └── [20.4G]  SRR14730301
+            ├── SRR14730301
+            │   ├── [9.1G]  VH02001612_S9_R1_001.fastq.gz
+            │   └── [22.0G]  VH02001612_S9_R2_001.fastq.gz
+            ├── [16.9G]  SRR14730302
+            │   └── [16.9G]  SRR14730302
+            ├── SRR14730302
+            │   ├── [7.6G]  VH02001614_S8_R1_001.fastq.gz
+            │   └── [18.0G]  VH02001614_S8_R2_001.fastq.gz
+            ├── [18.0G]  SRR14730303
+            │   └── [18.0G]  SRR14730303
+            ├── SRR14730303
+            │   ├── [8.1G]  VH02001618_S7_R1_001.fastq.gz
+            │   └── [19.2G]  VH02001618_S7_R2_001.fastq.gz
+            ├── [16.5G]  SRR14730304
+            │   └── [16.5G]  SRR14730304
+            ├── SRR14730304
+            │   ├── [7.5G]  VH02001700_S6_R1_001.fastq.gz
+            │   └── [17.8G]  VH02001700_S6_R2_001.fastq.gz
+            ├── [19.0G]  SRR14730305
+            │   └── [19.0G]  SRR14730305
+            ├── SRR14730305
+            │   ├── [8.4G]  VH02001702_S5_R1_001.fastq.gz
+            │   └── [20.6G]  VH02001702_S5_R2_001.fastq.gz
+            ├── [14.6G]  SRR14730306
+            │   └── [14.6G]  SRR14730306
+            ├── SRR14730306
+            │   ├── [6.6G]  VH02001704_S4_R1_001.fastq.gz
+            │   └── [16.0G]  VH02001704_S4_R2_001.fastq.gz
+            ├── [21.5G]  SRR14730307
+            │   └── [21.5G]  SRR14730307
+            ├── SRR14730307
+            │   ├── [9.6G]  VH02001708_S3_R1_001.fastq.gz
+            │   └── [23.2G]  VH02001708_S3_R2_001.fastq.gz
+            ├── [20.7G]  SRR14730308
+            │   └── [20.7G]  SRR14730308
+            ├── SRR14730308
+            │   ├── [9.3G]  VH02001710_S2_R1_001.fastq.gz
+            │   └── [22.1G]  VH02001710_S2_R2_001.fastq.gz
+            ├── [15.8G]  SRR14730309
+            │   └── [15.8G]  SRR14730309
+            └── SRR14730309
+                ├── [7.2G]  VH02001712_S1_R1_001.fastq.gz
+                └── [16.9G]  VH02001712_S1_R2_001.fastq.gz
+
+18 directories, 37 files
+```
+
+
+The `orig` directory contains the original fastq files. The fastq files are available for 10k and 100k subsamples in the `10k` and `100k` directories, respectively.
+
+The `2-wells.fasta` file contains the barcodes for 2 wells.
+
+## Test run
+
+The pipeline can be run by creating a `params.yaml` file like this:
+
+```yaml
+param_list:
+  - input_r1: "gs://viash-hub-test-data/htrnaseq/v1/100k/SRR14730301/VH02001612_S9_R1_001.fastq"
+    input_r2: "gs://viash-hub-test-data/htrnaseq/v1/100k/SRR14730301/VH02001612_S9_R2_001.fastq"
+    genomeDir: "gs://viash-hub-test-data/htrnaseq/v1/genomeDir/gencode.v41.star.sparse"
+    barcodesFasta: "gs://viash-hub-test-data/htrnaseq/v1/2-wells.fasta"
+    id: sample_one
+  - input_r1: "gs://viash-hub-test-data/htrnaseq/v1/100k/SRR14730302/VH02001614_S8_R1_001.fastq"
+    input_r2: "gs://viash-hub-test-data/htrnaseq/v1/100k/SRR14730302/VH02001614_S8_R2_001.fastq"
+    genomeDir: "gs://viash-hub-test-data/htrnaseq/v1/genomeDir/gencode.v41.star.sparse"
+    barcodesFasta: "gs://viash-hub-test-data/htrnaseq/v1/2-wells.fasta"
+    id: sample_two
+```
+
+and then:
+
+```bash
+viash ns build --setup cb
+nextflow run . -main-script target/nextflow/workflows/htrnaseq/main.nf \
+  -profile docker \
+  -c target/nextflow/workflows/htrnaseq/nextflow.config \
+  -params-file params.yaml \
+  -resume \
+  --publish_dir output
+```
+
+Or, by running `src/workflows/htrnaseq/integration_test.sh`.
+
+
+## Special Thanks
+
+Developed in collaboration with Data Intuitive and Open Analytics.
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -0,0 +1,20 @@
+# Package-level Viash configuration for the htrnaseq pipeline.
+name: htrnaseq
+description: |
+  High-throughput pipeline [WIP]
+license: MIT
+keywords: [bioinformatics, sequence, high-throughput, mapping, counting, pipeline]
+links:
+  issue_tracker: https://github.com/viash-hub/htrnaseq/issues
+  repository: https://github.com/viash-hub/htrnaseq
+
+# Viash version this package is pinned to.
+viash_version: 0.9.0
+
+info:
+  # Remote test data location (`path`) and the directory name it is mapped to (`dest`).
+  test_resources:
+    - path: gs://viash-hub-test-data/htrnaseq/v1/
+      dest: resources_test
+
+# Config mods; presumably applied by Viash to every component of this package
+# (TODO confirm against the Viash documentation). They:
+#   - set `.requirements.commands` to ['ps'];
+#   - make the nextflow runner's config script include `nextflow_labels.config`;
+#   - add `/src/config/labels.config` as a resource named `nextflow_labels.config`.
+config_mods: |
+  .requirements.commands := ['ps']
+  .runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'
+  .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}
--- a/main.nf
+++ b/main.nf
@@ -0,0 +1,3 @@
+workflow {
+print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.")
+}
--- a/nextflow.config
+++ b/nextflow.config
@@ -0,0 +1,6 @@
+manifest {
+  name = "htrnaseq"
+  version = "fix_20"
+  defaultBranch = "main"
+  nextflowVersion = "!>=20.12.1-edge"
+}
--- a/src/base/authors/dries_schaumont.yaml
+++ b/src/base/authors/dries_schaumont.yaml
@@ -0,0 +1,11 @@
+name: Dries Schaumont
+info:
+  links:
+    email: dries@data-intuitive.com
+    github: DriesSchaumont
+    orcid: "0000-0002-4389-0440"
+    linkedin: dries-schaumont
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Scientist
--- a/src/base/authors/marijke_van_moerbeke.yaml
+++ b/src/base/authors/marijke_van_moerbeke.yaml
@@ -0,0 +1,10 @@
+# Author metadata; pulled into component configs through `__merge__`.
+name: Marijke Van Moerbeke
+info:
+  links:
+    github: mvanmoerbeke
+    # Quoted so the identifier is always read as a string.
+    orcid: "0000-0002-3097-5621"
+    linkedin: marijke-van-moerbeke-84303a34
+  organizations:
+    - name: OpenAnalytics
+      href: https://www.openanalytics.eu
+      role: Statistical Consultant
--- a/src/base/authors/toni_verbeiren.yaml
+++ b/src/base/authors/toni_verbeiren.yaml
@@ -0,0 +1,10 @@
+name: Toni Verbeiren
+info:
+  role: Core Team Member
+  links:
+    github: tverbeiren
+    linkedin: verbeiren
+  organizations:
+  - name: Data Intuitive
+    href: https://www.data-intuitive.com
+    role: Data Scientist and CEO
--- a/src/config/labels.config
+++ b/src/config/labels.config
@@ -0,0 +1,108 @@
+executor {
+  $k8s {
+    submitRateLimit = '10sec'
+    pollInterval = '1 sec'
+  }
+}
+
+process {
+  container = 'nextflow/bash:latest'
+  
+  // default resources
+  memory = { 8.Gb * task.attempt }
+  cpus = 8
+  maxForks = 36
+
+  // Retry for exit codes that have something to do with memory issues
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+  maxRetries = 3
+  maxMemory = 192.GB
+
+  // Resource labels
+  withLabel: verylowcpu { cpus = 2 }
+  withLabel: lowcpu { cpus = 8 }
+  withLabel: midcpu { cpus = 16 }
+  withLabel: highcpu { cpus = 32 }
+  
+  withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
+  withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
+  withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
+  withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
+
+}
+
+profiles {
+  // detect tempdir
+  tempDir = java.nio.file.Paths.get(
+    System.getenv('NXF_TEMP') ?:
+      System.getenv('VIASH_TEMP') ?: 
+      System.getenv('TEMPDIR') ?: 
+      System.getenv('TMPDIR') ?: 
+      '/tmp'
+  ).toAbsolutePath()
+
+  mount_temp {
+    docker.temp            = tempDir
+    podman.temp            = tempDir
+    charliecloud.temp      = tempDir
+  }
+
+  no_publish {
+    process {
+      withName: '.*' {
+        publishDir = [
+          enabled: false
+        ]
+      }
+    }
+  }
+
+  docker {
+    docker.fixOwnership    = true
+    docker.enabled         = true
+    // docker.userEmulation   = true
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+
+  local {
+    // This config is for local processing.
+    process {
+        withName: ".*parallel_map_process" {
+          maxForks = 1
+        }
+        maxMemory = 25.GB
+        withLabel: verylowcpu { cpus = 2 }
+        withLabel: lowcpu { cpus = 4 }
+        withLabel: midcpu { cpus = 6 }
+        withLabel: highcpu { cpus = 8 }
+  
+        withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
+        withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
+        withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
+    }
+  }
+}
+
+/**
+ * Cap a requested memory amount at `process.maxMemory`.
+ *
+ * @param to_compare the requested amount of memory (compared against
+ *                   `process.maxMemory` via `compareTo`)
+ * @return `to_compare` when no `process.maxMemory` is configured or when the
+ *         request does not exceed it; `process.maxMemory` when the request
+ *         exceeds it or when `task.attempt` equals `process.maxRetries`.
+ *
+ * Any exception raised while evaluating the limits prints an error message
+ * and terminates the JVM with exit code 1.
+ */
+def get_memory(to_compare) {
+    // No limit configured: hand back the request unchanged.
+    if (!process.containsKey("maxMemory") || !process.maxMemory) {
+      return to_compare
+    }
+
+    try {
+      // NOTE(review): `task` is not a parameter of this function; confirm it
+      // resolves when this is invoked from a process-directive closure.
+      if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
+        // On the attempt matching maxRetries, request the configured maximum.
+        return process.maxMemory
+      }
+      else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
+        // Request exceeds the limit: clamp to the limit. (This previously
+        // referenced an undefined `max_memory` variable.)
+        return process.maxMemory as nextflow.util.MemoryUnit
+      }
+      else {
+        return to_compare
+      }  
+    } catch (all) {
+          println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
+          System.exit(1)
+    }
+  }
--- a/src/eset/create_eset/config.vsh.yaml
+++ b/src/eset/create_eset/config.vsh.yaml
@@ -0,0 +1,79 @@
+name: create_eset
+namespace: "eset"
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ maintainer ]
+  - __merge__: /src/base/authors/marijke_van_moerbeke.yaml
+    roles: [ author ]
+argument_groups:
+  - name: "Arguments"
+    arguments:
+    - type: file
+      name: "--pDataFile"
+      required: true
+    - type: file
+      name: "--fDataFile"
+      required: true
+    - type: file
+      name: "--mappingDir"
+      multiple: true
+      required: true
+    - type: string
+      name: --poolName
+      required: true
+    - name: "--output"
+      type: file
+      required: true
+      direction: output
+      default: eset.$id.rds
+resources:
+  - type: r_script
+    path: script.R
+test_resources:
+  - type: r_script
+    path: test.R
+  - path: test_data/pData.tsv
+  - path: test_data/fData.tsv
+  - path: test_data/mapping_dir
+# Docker engine: system libraries first, then R packages in two ordered steps.
+engines:
+  - type: docker
+    image: r-base:4.3.0
+    setup:
+      - type: apt
+        packages: # most of these are required for devtools
+          - libcurl4-openssl-dev
+          - libssl-dev
+          - libxml2-dev
+          - libfftw3-dev # Seurat
+          - libfontconfig1-dev # Seurat
+          - libfreetype-dev # Seurat
+          - libhdf5-dev # Seurat
+          - bzip2 # Seurat
+          - libharfbuzz-dev # Seurat
+          - libfribidi-dev # Seurat
+          - libtiff-dev # Seurat
+          - libgsl-dev # Seurat
+          - libcairo-dev # Seurat
+          - libudunits2-dev # SeuratObject
+          - procps
+      # Step 1: installer tooling plus the Bioconductor dependencies of nlcv.
+      - type: r
+        cran:
+          - data.table
+          - BiocManager
+          - remotes
+        bioc:
+          - Biobase
+          - limma  # dependency for nlcv
+          - a4Core # dependency for nlcv
+          - MLInterfaces # dependency for nlcv
+          - multtest # dependency for nlcv
+      # Step 2: kept as a separate setup entry because a single mapping cannot
+      # hold two `cran` keys (the duplicate would silently replace the first
+      # list in most parsers) and nlcv must come after its dependencies.
+      - type: r
+        cran:
+          - nlcv
+        script: |
+            remotes::install_url("https://cran.r-project.org/src/contrib/Archive/Matrix/Matrix_1.6-5.tar.gz", dependencies=TRUE, upgrade_dependencies=FALSE);\
+            remotes::install_url("https://cran.r-project.org/src/contrib/Archive/Seurat/Seurat_4.4.0.tar.gz", repos=BiocManager::repositories(), dependencies=TRUE, upgrade_dependencies=FALSE)\
+runners:
+  - type: executable
+  - type: nextflow
+
+      
--- a/src/eset/create_eset/script.R
+++ b/src/eset/create_eset/script.R
@@ -0,0 +1,432 @@
+library(Biobase)
+library(data.table)
+library(nlcv)
+library(Matrix)
+library(Seurat)
+
+### VIASH START
+par <- list(
+  pDataFile = "src/eset/create_eset/test_data/pData.tsv",
+  fDataFile = "src/eset/create_eset/test_data/fData.tsv",
+  studyType = "Standard",
+  mappingDir = c("src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC",
+                 "src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT"),
+  output = "eset.rds",
+  poolName = "Foo"
+)
+### VIASH END
+
+
+Read10X <- function(data_dir = NULL, gene_column = 2, unique_features = TRUE) {
+  full.data <- list()
+  for (i in seq_along(along.with = data_dir)) {
+    run <- data_dir[i]
+    if (!dir.exists(paths = run)) {
+      stop("Directory provided does not exist")
+    }
+    barcode.loc <- file.path(run, "barcodes.tsv")
+    gene.loc <- file.path(run, "features.tsv")
+    features.loc <- file.path(run, "features.tsv.gz")
+    matrix.loc <- file.path(run, "matrix.mtx")
+    pre_ver_3 <- file.exists(gene.loc)
+    if (!pre_ver_3) {
+      addgz <- function(s) {
+        return(paste0(s, ".gz"))
+      }
+      barcode.loc <- addgz(s = barcode.loc)
+      matrix.loc <- addgz(s = matrix.loc)
+    }
+    if (!file.exists(barcode.loc)) {
+      stop("Barcode file missing")
+    }
+    if (!pre_ver_3 && !file.exists(features.loc)) {
+      stop("Gene name or features file missing")
+    }
+    if (!file.exists(matrix.loc)) {
+      stop("Expression matrix file missing")
+    }
+    data <- readMM(file = matrix.loc)
+    cell.names <- readLines(barcode.loc)
+    if (all(grepl(pattern = "\\-1$", x = cell.names))) {
+      cell.names <- as.vector(x = as.character(x = sapply(X = cell.names, 
+                                                          FUN = ExtractField, field = 1, delim = "-")))
+    }
+    if (is.null(x = names(x = data_dir))) {
+      if (i < 2) {
+        colnames(x = data) <- cell.names
+      }
+      else {
+        colnames(x = data) <- paste0(i, "_", cell.names)
+      }
+    }
+    else {
+      colnames(x = data) <- paste0(names(x = data_dir)[i], 
+                                   "_", cell.names)
+    }
+    feature.names <- read.delim(file = ifelse(test = pre_ver_3, 
+                                              yes = gene.loc, no = features.loc), header = FALSE, 
+                                stringsAsFactors = FALSE)
+    if (any(is.na(x = feature.names[, gene_column]))) {
+      warning("Some features names are NA. Replacing NA names with ID from the opposite column requested", 
+              call. = FALSE, immediate. = TRUE)
+      na.features <- which(x = is.na(x = feature.names[, 
+                                                       gene_column]))
+      replacement.column <- ifelse(test = gene_column == 
+                                     2, yes = 1, no = 2)
+      feature.names[na.features, gene_column] <- feature.names[na.features, 
+                                                               replacement.column]
+    }
+    if (unique_features) {
+      fcols = ncol(x = feature.names)
+      if (fcols < gene_column) {
+        stop(paste0("gene_column was set to ", gene_column,
+                    " but feature.tsv.gz (or genes.tsv) only has ",
+                    fcols, " columns.", " Try setting the gene_column ",
+                    "argument to a value <= to ", 
+                    fcols, "."))
+      }
+      rownames(x = data) <- make.unique(names = feature.names[, 
+                                                              gene_column])
+    }
+    if (ncol(x = feature.names) > 2) {
+      data_types <- factor(x = feature.names$V3)
+      lvls <- levels(x = data_types)
+      if (length(x = lvls) > 1 && length(x = full.data) == 0) {
+        message(paste0("10X data contains more than one type and is ",
+                       "being returned as a list containing matrices ",
+                       "of each type."))
+      }
+      expr_name <- "Gene Expression"
+      if (expr_name %in% lvls) {
+        lvls <- c(expr_name, lvls[-which(x = lvls == 
+                                           expr_name)])
+      }
+      data <- lapply(X = lvls, FUN = function(l) {
+        return(data[data_types == l, , drop = FALSE])
+      })
+      names(x = data) <- lvls
+    } else {
+      data <- list(data)
+    }
+    full.data[[length(x = full.data) + 1]] <- data
+  }
+  list_of_data <- list()
+  for (j in 1:length(x = full.data[[1]])) {
+    list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, 
+                                               FUN = `[[`, j))
+    list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "CsparseMatrix")
+  }
+  names(x = list_of_data) <- names(x = full.data[[1]])
+  if (length(x = list_of_data) == 1) {
+    return(list_of_data[[1]])
+  } else {
+    return(list_of_data)
+  }
+}
+
+# Restrict an expression matrix and a feature annotation to their shared
+# features and align the matrix rows to the annotation's row order.
+#
+# Arguments:
+#   exprs_matrix  matrix with feature identifiers as row names
+#   fdata         data.frame with feature identifiers as row names
+# Returns a list with:
+#   exprs_matrix  rows limited to shared features, ordered like `fdata`
+#   fdata         rows limited to shared features, all columns as character
+# A message is emitted when the two inputs do not carry identical row names.
+match_features <- function(exprs_matrix, fdata) {
+
+  # identical() also covers differing lengths, without the recycling warning
+  # that an element-wise `==` produces on vectors of unequal length.
+  identical_features <- identical(rownames(exprs_matrix), rownames(fdata))
+
+  if (!identical_features) {
+    message(paste0("Features in 'fData' and expression matrix differ. ",
+                   "Only matching features are returned."))
+  }
+
+  features <- intersect(rownames(exprs_matrix), rownames(fdata))
+  # drop = FALSE keeps the matrix / data.frame structure when there is only
+  # a single sample column or a single annotation column.
+  exprs_matrix <- exprs_matrix[rownames(exprs_matrix) %in% features, ,
+                               drop = FALSE]
+  fdata <- fdata[rownames(fdata) %in% features, , drop = FALSE]
+
+  fdata[] <- lapply(fdata, as.character)
+  # order features in exprs mat according to fdata
+  exprs_matrix <- exprs_matrix[match(rownames(fdata), rownames(exprs_matrix)), ,
+                               drop = FALSE]
+
+  list(exprs_matrix = exprs_matrix, fdata = fdata)
+
+}
+
+
+# Build the phenotype table for one pool.
+#
+# Bookkeeping columns are removed from the sample annotation, rows are named
+# "<pool_name>_<WellBC>", a `PoolName` column is added, and the rows are
+# returned in the order given by `barcodes`.
+create_pdata <- function(sample_file, pool_name, barcodes) {
+  dropped <- c("SampleFileName", "Output", "Measure", "Strandedness")
+  keep <- !(colnames(sample_file) %in% dropped)
+  pheno <- sample_file[, keep, drop = FALSE]
+
+  prefix_pool <- function(well) paste(pool_name, well, sep = "_")
+  rownames(pheno) <- as.character(lapply(sample_file$WellBC, prefix_pool))
+
+  pheno$PoolName <- pool_name
+
+  # one output row per requested barcode, in the order they were requested
+  row_order <- match(barcodes, pheno$WellBC)
+  pheno[row_order, ]
+}
+
+# Validate the mapping directories and the sample annotation.
+#
+# Arguments:
+#   mapping_dir  vector or list of directory paths with STARsolo matrices
+#   sample_file  data.frame with the sample annotation; needs a 'WellBC' column
+# Returns a list with `sample_expression_files`: the unique absolute paths of
+# the mapping directories.
+# Stops when a directory does not exist, when 'WellBC' is absent, or when a
+# directory lacks any of barcodes/features/matrix (plain or gzipped).
+check_sample_file <- function(mapping_dir, sample_file){
+
+  message("Checking sample annotation:")
+
+  requireNamespace("tools")
+  mapping_dir <- unlist(lapply(mapping_dir, function(x) {
+    if (!dir.exists(x)) {
+      stop(sprintf(paste0("Could not find directory ",
+                          "provided in 'mappingDir' argument (%s)."), x))
+    }
+    tools::file_path_as_absolute(x)
+  }))
+
+
+  # additional check for STARsolo: each of the three expected files must be
+  # present, either plain or gzipped; unrelated extra files are tolerated.
+  check_STARsolo_output <- function(x) {
+    files <- c("barcodes.tsv", "features.tsv", "matrix.mtx")
+    present <- list.files(x)
+    all(files %in% present | paste0(files, ".gz") %in% present)
+  }
+
+
+  if (!"WellBC" %in% colnames(sample_file)) {
+    stop(paste0("STARsolo output is used. The sample annotation must ",
+                "contain 'WellBC' column providing cell barcodes."))
+  }
+
+  mapping_dir <- unique(mapping_dir)
+  all_STARsolo_files_present <- all(
+    unlist(
+      lapply(mapping_dir, function(x) {
+        check_STARsolo_output(x)
+      })
+    )
+  )
+  if (!all_STARsolo_files_present) {
+    stop(paste0("Could not find files: 'barcodes', 'features' and 'matrix'",
+                " for STARsolo output. Please check 'mappingDir' argument."))
+  }
+
+  # NOTE(review): no 'SampleFileName' check happens in this function; the
+  # message text is kept unchanged for log compatibility.
+  message("- 'SampleFileName' column - OK")
+
+
+
+  list(sample_expression_files = mapping_dir)
+}
+
+# Build the dense count matrix for the requested wells.
+#
+# Arguments:
+#   exprs_file_paths  directories handed to Read10X()
+#   col_names         column names assigned to the returned matrix
+#   cell_barcodes     barcodes; columns whose name matches any of them are kept
+#   exprs_matrix_path, output, measure  accepted but not used in this function
+# Returns a base matrix of counts with features as rows.
+create_exprs_matrix <- function(exprs_matrix_path, exprs_file_paths,
+                                output, measure, col_names, cell_barcodes) {
+
+  read_matrix <- Read10X(data_dir = exprs_file_paths, gene_column = 1)
+  # drop = FALSE: stay a matrix even when only one column has counts
+  read_matrix <- read_matrix[, which(colSums(read_matrix) != 0), drop = FALSE]
+  # keep index of feature names containing "_" because Seurat
+  #changes them to "-" and they no longer match with fdata[, "gene_id"]
+  idx <- grep("_", rownames(read_matrix))
+
+  requireNamespace("Seurat")
+  seurat_object <- Seurat::CreateSeuratObject(counts = read_matrix)
+
+  exprs_matrix <- as.matrix(seurat_object[['RNA']]@counts)
+  # replace "-" with "_" for features with "_" 
+  # before converting to Seurat object
+  rownames(exprs_matrix)[idx] <- gsub("-", "_", rownames(exprs_matrix)[idx])
+  requireNamespace("stringr")
+  # drop = FALSE: a single matching well must not collapse to a vector
+  exprs_matrix <- exprs_matrix[, stringr::str_detect(colnames(exprs_matrix),
+                                  paste(cell_barcodes, collapse = "|")),
+                               drop = FALSE]
+
+
+  # check if rownames are ENSEMBL and remove version suffix
+  isENSEMBL <- all(grepl("ENS", rownames(exprs_matrix)))
+  if (isENSEMBL) {
+    # do not use gsub("(.+)[.]\\d+", "\\1", rownames(exprs_matrix)),
+    # so that ENS000000.1_PAR_Y can be kept
+    rownames(exprs_matrix) <- gsub("\\.\\d+$", "", rownames(exprs_matrix))
+  }
+
+
+  colnames(exprs_matrix) <- col_names
+
+  exprs_matrix
+}
+
+create_eset <- function(feature_annotation_path,
+                        sample_annotation_path,
+                        mapping_dir,
+                        barcodes,
+                        output_path,
+                        pool_name,
+                        exprs_matrix_path = NULL,
+                        path = NULL,
+                        add_eset_annotation = NULL) {
+  if (!file.exists(feature_annotation_path)) {
+    stop("Could not find feature annotation at '", feature_annotation_path, "'")
+  }
+
+  if (!file.exists(sample_annotation_path)) {
+    stop("Could not find sample annotation at '", sample_annotation_path, "'")
+  }
+
+  if(!is.null(exprs_matrix_path)) {
+    if(!file.exists(exprs_matrix_path)) {
+      stop("Could not find expression matrix at '", exprs_matrix_path, "'")
+    }
+  }
+
+  if(!is.null(path)) {
+    if(!dir.exists(path)) {
+      stop("Provided 'path': '", path, "' does not exist.")
+    }
+  }
+
+  ##### Import annotation files #####
+  message("Importing feature annotation")
+  fdata_file <- read.table(feature_annotation_path, header = TRUE,
+                           sep = "\t", quote = "\"",
+                           comment.char = "", stringsAsFactors = FALSE)
+
+  # for backwards compatibility
+  if("ENSEMBL" %in% colnames(fdata_file) && !all(grepl("ENS", fdata_file[, "ENSEMBL"])) & !"gene_id" %in% colnames(fdata_file)) {
+    colnames(fdata_file)[which(colnames(fdata_file) == "ENSEMBL")] <- "gene_id"
+  }
+
+  # Check gene annotation
+  if(!"gene_id" %in% colnames(fdata_file))
+    stop("'gene_id' column with unique feature identifiers must be present in 'feature_annotation_path'.")
+
+  # check if duplicated ids are present
+  if(any(duplicated(fdata_file$gene_id)))
+    stop("Duplicated features ids are not allowed. Please check the 'gene_id' column in 'feature_annotation_path'.")
+
+  message("Importing sample annotation")
+  sample_file <- read.table(sample_annotation_path, header = TRUE,
+                            sep = "\t", quote = "\"",
+                            comment.char = "", stringsAsFactors = FALSE)
+  # Check sample annotation
+  check_sample_file_list <- check_sample_file(mapping_dir = mapping_dir,
+                                              sample_file = sample_file)
+  output <- "STARsolo"
+  measure <- "counts"
+  sample_expression_files <- check_sample_file_list$sample_expression_files
+
+  ##### Create phenodata #####
+  pdata_eset <- create_pdata(sample_file = sample_file, pool_name = pool_name,
+                             barcodes = barcodes)
+
+  ##### Create expression matrix #####
+  message("Creating expression matrix")
+
+  exprs_matrix_eset <- create_exprs_matrix(
+    exprs_matrix_path = exprs_matrix_path,
+    exprs_file_paths = sample_expression_files,
+    output = output,
+    measure = measure,
+    col_names = rownames(pdata_eset),
+    cell_barcodes = barcodes
+  )
+
+
+  ##### Create featuredata #####
+  message("Creating feature data")
+
+  fdata_eset <- fdata_file
+  rownames(fdata_eset) <- fdata_eset[, "gene_id"]
+
+  # intersect features between exprs matrix and fdata
+  feature_files <- match_features(exprs_matrix = exprs_matrix_eset,
+                                  fdata = fdata_eset)
+
+  fdata_eset <- feature_files$fdata
+  exprs_matrix_eset <- feature_files$exprs_matrix
+
+  ##### Create eSet #####
+  message("Creating eset")
+
+  if (nrow(pdata_eset) != ncol(exprs_matrix_eset)) {
+    stop("nrow(pData) and ncol(exprsMatrix) differ")
+  }
+
+  if (nrow(fdata_eset) != nrow(exprs_matrix_eset)) {
+    stop("nrow(fData) and nrow(exprsMatrix) differ")
+  }
+
+  if (!all(rownames(pdata_eset) == colnames(exprs_matrix_eset))) {
+    stop("rownames(pData) and colnames(exprsMatrix) differ")
+  }
+
+  if (!all(rownames(fdata_eset) == rownames(exprs_matrix_eset))) {
+    stop("rownames(fData) and rownames(exprsMatrix) differ")
+  }
+
+  if (!inherits(exprs_matrix_eset, "matrix")) {
+    stop("exprsMatrix must be of class 'matrix'")
+  }
+
+
+
+  additional_info <- paste0("Additional information about eSet \n",
+                            "  Expression matrix created from ",
+                            output, " output. \n",
+                            "  Expression matrix contains non-transformed ",
+                            ifelse(output %in% c("STAR", "STARsolo"),
+                                   "counts",
+                                   ifelse(measure == "expected_count",
+                                          "counts", measure)), ".")
+
+
+  if (isTRUE(!is.null(add_eset_annotation) &
+               is.character(add_eset_annotation))) {
+    additional_info <- paste0(additional_info, "\n", "  ", add_eset_annotation)
+  }
+
+  fdata_eset <- new("AnnotatedDataFrame", data = fdata_eset)
+  pdata_eset <- new("AnnotatedDataFrame", data = pdata_eset)
+
+  requireNamespace("Biobase")
+  eset <- Biobase::ExpressionSet(assayData = exprs_matrix_eset,
+                                  phenoData = pdata_eset,
+                                  featureData = fdata_eset,
+                                  annotation = additional_info)
+
+
+  saveRDS(eset, file = output_path)
+
+  message(paste0("eset created succesfully for ", ncol(eset),
+                 " samples and ", nrow(eset),
+                 " genes and saved at ", output_path, ".")) 
+
+  eset
+}
+
+
+p_data_file <- par$pDataFile
+f_data_file <- par$fDataFile
+pool_name <- par$poolName
+mapping_dir <- lapply(par$mappingDir,
+                      \(x) file.path(x, "Solo.out", "Gene", "raw"))
+
+# Read the single well barcode belonging to one STARsolo raw directory.
+#
+# Arguments:
+#   raw_dir  path of a directory expected to hold a 'barcodes.tsv' file
+# Returns the barcode as a length-one character vector.
+# Stops when 'barcodes.tsv' is missing or does not contain exactly one line.
+get_barcode_from_mapping_dir <- function(raw_dir) {
+  barcodes_file <- file.path(raw_dir, "barcodes.tsv")
+  if (!file.exists(barcodes_file)) {
+    stop(paste0("Expected the 'Solo.out/Gene/raw' directory at ",
+                raw_dir, " to contain a 'barcodes.tsv' file."))
+  }
+  barcodes <- readLines(barcodes_file)
+  if (length(barcodes) != 1) {
+    stop(paste0("A single STAR Solo folder should only have ",
+                "mapped one (1) barcode, but found '",
+                length(barcodes), "' for mapping directory ", raw_dir))
+  }
+  return(barcodes)
+}
+
+barcodes <- lapply(mapping_dir, get_barcode_from_mapping_dir)
+
+print(paste0("mappingDir: ", mapping_dir))
+print(paste0("pDataFile: ", p_data_file))
+print(paste0("fDataFile: ", f_data_file))
+print(paste0("poolName: ", pool_name))
+print(paste0("barcodes: ", barcodes))
+
+
+
+# CREATE ESET WITH RAW UMI COUNTS
+
+eset <- create_eset(feature_annotation_path = f_data_file,
+                    sample_annotation_path = p_data_file,
+                    mapping_dir = mapping_dir,
+                    barcodes = barcodes,
+                    output_path = par$output,
+                    pool_name = pool_name,
+                    path = NULL,
+                    exprs_matrix_path = NULL)
--- a/src/eset/create_eset/test.R
+++ b/src/eset/create_eset/test.R
@@ -0,0 +1,76 @@
+library(testthat)
+library(Biobase)
+
+### VIASH START
+meta <- list(
+  resources_dir = "src/eset/create_eset/test_data",
+  executable = "target/executable/eset/create_eset/create_eset"
+)
+
+### VIASH END
+
+output <- tempfile()
+
+out <- processx::run(meta$executable, c(
+  "--pDataFile", file.path(meta$resources_dir, "pData.tsv"),
+  "--fDataFile", file.path(meta$resources_dir, "fData.tsv"),
+  "--mappingDir", file.path(meta$resources_dir, "mapping_dir", "AACAAGGTAC"),
+  "--mappingDir", file.path(meta$resources_dir, "mapping_dir", "ACGCCTTCGT"),
+  "--poolName", "foo",
+  "--output", output
+))
+expect_equal(out$status, 0)
+expect_true(file.exists(output))
+result <- readRDS(output)
+stopifnot(length(sampleNames(result)) == 2)
+stopifnot(all(sampleNames(result) == c("foo_AACAAGGTAC", "foo_ACGCCTTCGT")))
+expected_feature_names <- c(
+    "ENS0001058", "ENS0000221", "ENS0001387", "ENS0000508", "ENS0001199",
+    "ENS0000477", "ENS0001457", "ENS0001040", "ENS0000114", "ENS0000821",
+    "ENS0001429", "ENS0001396", "ENS0000355", "ENS0000122", "ENS0000441",
+    "ENS0001223", "ENS0001431", "ENS0000042", "ENS0000443", "ENS0000389",
+    "ENS0001208", "ENS0001140", "ENS0000071", "ENS0001369"
+)
+
+stopifnot(length(featureNames(result)) == 24)
+stopifnot(all(featureNames(result) == expected_feature_names))
+expected_expressions <- matrix(
+    c(0, 0,
+      0, 40,
+      0, 0,
+      0, 0,
+      1, 2,
+      0, 0,
+      0, 0,
+      0, 0,
+      2, 2,
+      0, 0,
+      0, 0,
+      8, 2,
+      0, 0,
+      1, 0,
+      2, 3,
+      0, 0,
+      0, 0,
+      0, 0,
+      1, 0,
+      0, 0,
+      16, 13,
+      0, 0,
+      12, 13,
+      5, 2),
+    ncol = 2,
+    nrow = 24,
+    byrow = TRUE,
+)
+rownames(expected_expressions) <- expected_feature_names
+colnames(expected_expressions) <- c("foo_AACAAGGTAC", "foo_ACGCCTTCGT")
+stopifnot(identical(exprs(result), expected_expressions))
+
+input_f_data <- read.table(file.path(meta$resources_dir, "fData.tsv"),
+                           sep = "\t", quote = "\"", comment.char = "",
+                           header = TRUE)
+input_f_data <- input_f_data[input_f_data$gene_id %in% expected_feature_names, ]
+row.names(input_f_data) <- input_f_data$gene_id
+input_f_data[] <- lapply(input_f_data, as.character)
+stopifnot(identical(input_f_data, fData(result)))
--- a/src/eset/create_eset/test_data/fData.tsv
+++ b/src/eset/create_eset/test_data/fData.tsv
--- a/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/barcodes.tsv
+++ b/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/barcodes.tsv
@@ -0,0 +1 @@
+AACAAGGTAC
--- a/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/features.tsv
+++ b/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/features.tsv
@@ -0,0 +1,25 @@
+ENS0001140	209E3	Gene Expression
+ENS0001058	A2B9A	Gene Expression
+ENS0000508	CF168	Gene Expression
+ENS0001457	3BA5A	Gene Expression
+ENS0001431	1C968	Gene Expression
+ENS0000821	E5192	Gene Expression
+ENS0001040	1821B	Gene Expression
+ENS0000443	5AD11	Gene Expression
+ENS0000441	3F0FF	Gene Expression
+ENS0001387	265F2	Gene Expression
+ENS0001223	28A43	Gene Expression
+ENS0001208	58E28	Gene Expression
+ENS0001396	6E614	Gene Expression
+ENS0001199	EA941	Gene Expression
+ENS0001369	99DDC	Gene Expression
+ENS0000770	AFCC0	Gene Expression
+ENS0000389	B58E5	Gene Expression
+ENS0000071	7A6C3	Gene Expression
+ENS0000114	65424	Gene Expression
+ENS0000355	077A2	Gene Expression
+ENS0001429	22A4F	Gene Expression
+ENS0000477	981E6	Gene Expression
+ENS0000042	E2D99	Gene Expression
+ENS0000122	D90E9	Gene Expression
+ENS0000221	97B0F	Gene Expression
--- a/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/matrix.mtx
+++ b/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/matrix.mtx
@@ -0,0 +1,13 @@
+%%MatrixMarket matrix coordinate integer general
+%
+25 1 10
+8 1 1
+9 1 2
+12 1 16
+13 1 8
+14 1 1
+15 1 5
+16 1 5
+18 1 12
+19 1 2
+24 1 1
--- a/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/barcodes.tsv
+++ b/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/barcodes.tsv
@@ -0,0 +1 @@
+ACGCCTTCGT
--- a/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/features.tsv
+++ b/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/features.tsv
@@ -0,0 +1,25 @@
+ENS0001140	209E3	Gene Expression
+ENS0001058	A2B9A	Gene Expression
+ENS0000508	CF168	Gene Expression
+ENS0001457	3BA5A	Gene Expression
+ENS0001431	1C968	Gene Expression
+ENS0000821	E5192	Gene Expression
+ENS0001040	1821B	Gene Expression
+ENS0000443	5AD11	Gene Expression
+ENS0000441	3F0FF	Gene Expression
+ENS0001387	265F2	Gene Expression
+ENS0001223	28A43	Gene Expression
+ENS0001208	58E28	Gene Expression
+ENS0001396	6E614	Gene Expression
+ENS0001199	EA941	Gene Expression
+ENS0001369	99DDC	Gene Expression
+ENS0000770	AFCC0	Gene Expression
+ENS0000389	B58E5	Gene Expression
+ENS0000071	7A6C3	Gene Expression
+ENS0000114	65424	Gene Expression
+ENS0000355	077A2	Gene Expression
+ENS0001429	22A4F	Gene Expression
+ENS0000477	981E6	Gene Expression
+ENS0000042	E2D99	Gene Expression
+ENS0000122	D90E9	Gene Expression
+ENS0000221	97B0F	Gene Expression
--- a/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/matrix.mtx
+++ b/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/matrix.mtx
@@ -0,0 +1,12 @@
+%%MatrixMarket matrix coordinate integer general
+%
+25 1 9
+9 1 3
+12 1 13
+13 1 2
+14 1 2
+15 1 2
+16 1 3
+18 1 13
+19 1 2
+25 1 40
--- a/src/eset/create_eset/test_data/pData.tsv
+++ b/src/eset/create_eset/test_data/pData.tsv
@@ -0,0 +1,3 @@
+WellBC	WellID	NumberOfMTReads	pctMT	NumberOfERCCReads	pctERCC	NumberOfChromReads	pctChrom	NumberOfInputReads	NumberOfMappedReads	PctMappedReads	NumberOfReadsMappedToMultipleLoci	PectOfReadsMappedToMultipleLoci	NumberOfReadsMappedToTooManyLoci	PectOfReadsMappedToTooManyLoci	NumberOfReadsUnmappedTooManyMismatches	PectOfReadsUnmappedTooManyMismatches	NumberOfReadsUnmappedTooShort	PectOfReadsUnmappedTooShort	NumberOfReadsUnmappedOther	PectOfReadsUnmappedOther	ReadsWithValidBarcodes	SequencingSaturation	Q30BasesInCB+UMI	ReadsMappedToTranscriptome:Unique+MultipeGenes	EstimatedNumberOfCells	FractionOfReadsInCells	MeanReadsPerCell	NumberOfUMIs	NumberOfGenes	NumberOfCountedReads
+AACAAGGTAC	A1	0	0	0	0	8542	100	141303	23749	16.81	0	0	8458	5.99	0	0	109035	77.16	61	0.04	0.999816	0.0698056	0.979965	0.0618175	1	1	8538	7942	408	9535
+ACGCCTTCGT	B2	0	0	0	0	5863	100	96430	16869	17.49	0	0	6124	6.35	0	0	73375	76.09	62	0.06	0.999782	0.0665302	0.980077	0.0620969	1	1	5862	5472	377	6463
--- a/src/eset/create_fdata/config.vsh.yaml
+++ b/src/eset/create_fdata/config.vsh.yaml
@@ -0,0 +1,46 @@
+# Viash component definition for `eset/create_fdata`.
+name: create_fdata
+namespace: eset
+description: |
+  Create a fdata file
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ maintainer ]
+  - __merge__: /src/base/authors/marijke_van_moerbeke.yaml
+    roles: [ contributor ]
+arguments:
+# Input annotation; the only required argument.
+- name: "--gtf"
+  type: file
+  description: "Genome annotation file in GTF format."
+  required: true
+- name: "--output"
+  description: |
+    Tab-delimited text file containing information about the 'gene' or 'transcript'
+    entries from the input GTF file. The 'transcript' entries are used in case the source
+    of the GTF was 'refGene' or 'ncbiRefSeq'.
+  type: file
+  direction: output
+  # NOTE(review): `$id` is presumably expanded by the Nextflow runner - confirm.
+  default: fData.$id.txt
+resources:
+# Script that implements the component.
+- type: python_script
+  path: create_fdata.py
+test_resources:
+- type: python_script
+  path: test.py
+# GTF fixture read by test.py.
+- path: test_annotation.gtf
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      # NOTE(review): procps is presumably installed because the Nextflow
+      # runner needs `ps` inside the container - confirm.
+      - type: apt
+        packages:
+          - procps
+      # pandas is imported by create_fdata.py and test.py.
+      - type: python
+        packages:
+          - pandas
+    # Only installed for running the tests.
+    test_setup:
+      - type: python
+        packages:
+          - viashpy
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/eset/create_fdata/create_fdata.py
+++ b/src/eset/create_fdata/create_fdata.py
@@ -0,0 +1,130 @@
+import logging
+import pandas as pd
+import numpy as np
+from textwrap import fill
+
+
+### VIASH START
+meta = {  # NOTE(review): presumably replaced by viash between the START/END markers at build time - confirm
+    "name": "create_fdata",
+}
+
+par = {  # development-time argument values; keys mirror --gtf and --output
+  "gtf": "src/eset/create_fdata/test_annotation.gtf",
+  "output": "fData.tsv"
+}
+
+### VIASH END
+
+logger = logging.getLogger()  # no name given: this is the root logger
+console_handler = logging.StreamHandler()  # no stream given: StreamHandler writes to sys.stderr
+logger.addHandler(console_handler)
+logger.setLevel(logging.DEBUG)  # DEBUG and everything above is emitted
+
+
+def read_gtf(gtf_path: str) -> pd.DataFrame:
+    logger.info("Reading %s", gtf_path)
+    result = pd.read_csv(gtf_path, sep="\t",
+                         header=None, names=("seqname", "source",
+                                             "feature", "start", "end",
+                                             "score", "strand", "frame",
+                                             "attribute"),
+                         dtype={
+                            "seqname": pd.StringDtype(),
+                            "source": pd.StringDtype(),
+                            "feature": pd.StringDtype(),
+                            "start": pd.Int64Dtype(),
+                            "end": pd.Int64Dtype(),
+                            "score": pd.StringDtype(),
+                            "strand": pd.CategoricalDtype(categories=["+", "-"],
+                                                            ordered=False),
+                            "frame": pd.StringDtype(),
+                            "attribute": pd.StringDtype(),
+                          },
+                          comment='#'
+                        )
+    logger.info("Done reading %s. Found %d GTF entries ", par["gtf"], result.shape[0])
+    logger.info("GTF file is providing information for the following chromosomes: \n%s", 
+                fill(", ".join(result['seqname'].unique()), width=100))
+    logger.info("The following sources were specified in the GTF file:\n%s",
+                ", ".join(result["source"].unique()))
+    return result
+    
+
+def parse_attributes(attributes_series: pd.Series):
+    attribute_dict = dict()
+    attributes_list = [attr.strip().split(" ")
+                       for attr in attributes_series["attribute"].strip(";").split(";")]
+    for (attr_name, attr_value) in attributes_list:
+        attribute_dict.setdefault(attr_name, []).append(attr_value.strip('"'))
+    attribute_dict = {attr_name: "|".join(attr_value) 
+                      for attr_name, attr_value in attribute_dict.items()}
+    return pd.Series(attribute_dict)
+    
+
+def main(par):
+    logger.info(f"{meta['name']} started.")
+    parameters_str = [f'\t{param}: {param_val}\n' for param, param_val in par.items()]
+    logger.info("Parameters:\n%s", "".join(parameters_str).rstrip())
+    gtf_file = read_gtf(par["gtf"])
+    sources = set(source for source in gtf_file["source"].unique() if source != "ERCC")
+    specific_gtf = False
+    feature = "gene"
+    if len(sources) == 1 and (source := sources[0]) \
+        and (source == "refGene" or source == "ncbiRefSeq"):
+        feature = "transcript"
+        specific_gtf = True
+        logger.info("Found specific GTF from %s, forcing filtering on feature type %s", source, feature)
+    logger.info("Filtering GTF entries for feature type '%s'.", feature)
+    gtf_file = gtf_file[gtf_file["feature"] == feature]
+    logger.info("After filtering %d entries are left.", gtf_file.shape[0])
+    logger.info("Parsing the GTF attributes")
+    annotation = gtf_file[["attribute"]].apply(parse_attributes, result_type="expand", axis=1)
+    logger.info("Found the following attributes in the GTF:\n%s", ", ".join(annotation.columns))
+    annotation = pd.concat([gtf_file.drop(["attribute"], axis=1), annotation], axis=1)
+    if specific_gtf:
+       logger.info("Because the source of the GTF is either 'ncbiRefSeq' or 'refGene', which"
+                   "caused forced filtering based on %s, the duplicate genes still need to be dropped.",
+                   feature)
+       annotation = annotation.drop_duplicates(subset=("gene_id", "gene_name"), keep=False)
+       logger.info("After dropping duplicates, %d entries are left", annotation.shape[0])
+
+    # detect ensembl ids
+    # some GTF files contain version in ENSEMBL, e.g. ENS00000000046319.1
+    # we remove the version, because the annotation packages don't contain the version
+    if "gene_id" in annotation.columns:
+        logger.info("'gene_id' column was detected in attributes. Performing extra parsing of ENSEMBL ids.")
+        annotation["ENSEMBL_with_version"] = annotation["gene_id"].where(annotation["gene_id"].str.startswith("ENS"))
+        annotation["ENSEMBL"] = annotation["ENSEMBL_with_version"].str.replace(r"\.\d+$", "", regex=True)
+        annotation["gene_id"] = annotation["gene_id"].str.replace(r"\.\d+$", "", regex=True)
+
+    possible_name_columns = ("Name", "name", "gene_name")
+    found_columns = list(filter(lambda col_name: col_name in annotation, possible_name_columns))
+    # The following code allows to select a value for the SYMBOL column based on the first non-na column
+    if found_columns:
+        logger.info("Found one the following columns: %s; which can be used to populate the SYMBOL column",
+                    ", ".join(possible_name_columns))
+        # For each row (gtf entry), get the name of the first column that actually holds a value.
+        column_to_get = annotation.loc[:,found_columns].apply(pd.Series.first_valid_index, axis=1)
+        counts_per_column = column_to_get.value_counts(dropna=False).to_dict()
+        counts_per_column_str = [f'\t{col}: {counts}\n' for col, counts in counts_per_column.items()]
+        logger.info("Frequencies of the origin for the entries in the SYMBOL column:\n%s",
+                    "".join(counts_per_column_str).rstrip())
+        # If all columns hold NA for a certain row, first_valid_index will return None.
+        # Just use the name of the first column.
+        column_to_get = column_to_get.fillna(found_columns[0])
+        # We now have a list one column name per row, use it so select the values
+        # Loc cannot be used here because 1 value per row is required, 
+        # and loc will select for each row all the columns in columns_to_get
+        idx, cols = pd.factorize(column_to_get)
+        symbol_values = annotation.reindex(cols, axis=1).to_numpy()[np.arange(len(annotation)), idx]
+        annotation["SYMBOL"] = symbol_values
+
+    logger.info("Writing to %s", par["output"])
+    annotation = annotation.drop(["score", "source", "frame", "feature"], axis=1)
+    annotation.to_csv(par["output"], sep="\t", header=True, index=False, na_rep="NA")
+    logger.info("%s finished", meta['name'])
+
+
+if __name__ == "__main__":
+    main(par)
--- a/src/eset/create_fdata/test.py
+++ b/src/eset/create_fdata/test.py
@@ -0,0 +1,61 @@
+import pytest
+import sys
+import pandas as pd
+from pathlib import Path
+from uuid import uuid4
+
+### VIASH START
+meta = {  # NOTE(review): presumably replaced by viash between the START/END markers when the test is run - confirm
+    "resources_dir": "./src/eset/create_fdata/",
+    "executable": "target/executable/eset/create_fdata/create_fdata",
+    "config": "src/eset/create_fdata/config.vsh.yaml"
+}
+### VIASH END
+
+@pytest.fixture
+def test_annotation_path():
+    return Path(meta["resources_dir"]) / "test_annotation.gtf"
+
+
+@pytest.fixture
+def random_path(tmp_path):
+    def wrapper(extension=None):
+        extension = "" if not extension else f".{extension}"
+        return tmp_path / f"{uuid4()}{extension}"
+    return wrapper 
+
+
+def test_create_fdata(run_component, test_annotation_path, random_path):
+    output_path = random_path("tsv")
+    run_component([
+        "--gtf", test_annotation_path,
+        "--output", output_path
+    ])
+    assert output_path.is_file()
+    result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype())
+
+    expected_dict = {
+        "seqname": ["20", "20", "20", "21"],
+        "start": ["87250", "142590", "157454", "297570"],
+        "end": ["97094", "145751", "159163", "300321"],
+        "strand": ["+", "+", "+", "+"],
+        "gene_id": ["ENSG00000178591", "ENSG00000125788",
+                    "ENSG00000088782", "ENSG00000247315"],
+        "gene_version": ["7", "6", "5", "4"],
+        "gene_name": ["DEFB125", "DEFB126", "DEFB127", pd.NA],
+        "gene_source": ["ensembl_havana", "ensembl_havana",
+                        "ensembl_havana", "havana"],
+        "gene_biotype": ["protein_coding", "protein_coding",
+                         "protein_coding", "protein_coding"],
+        "ENSEMBL_with_version": ["ENSG00000178591.7", "ENSG00000125788",
+                                 "ENSG00000088782", "ENSG00000247315"],
+        "ENSEMBL": ["ENSG00000178591", "ENSG00000125788",
+                    "ENSG00000088782", "ENSG00000247315"],
+        "SYMBOL": ["DEFB125", "DEFB126", "DEFB127", pd.NA]
+    }
+    expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype())
+    pd.testing.assert_frame_equal(expected, result, check_like=True)
+
+
+if __name__ == '__main__':  # only when executed as a script, not when imported
+    sys.exit(pytest.main([__file__]))  # run pytest on this file and exit with its return code
--- a/src/eset/create_fdata/test_annotation.gtf
+++ b/src/eset/create_fdata/test_annotation.gtf
@@ -0,0 +1,45 @@
+20	ensembl_havana	gene	87250	97094	.	+	.	gene_id "ENSG00000178591.7"; gene_version "7"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
+20	havana	transcript	87250	97094	.	+	.	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000608838"; transcript_version "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "2";
+20	havana	exon	87250	87359	.	+	.	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000608838"; transcript_version "1"; exon_number "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003702629"; exon_version "1"; transcript_support_level "2";
+20	havana	exon	96005	97094	.	+	.	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000608838"; transcript_version "1"; exon_number "2"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003705060"; exon_version "1"; transcript_support_level "2";
+20	ensembl_havana	transcript	87672	97094	.	+	.	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)";
+20	ensembl_havana	exon	87672	87767	.	+	.	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; exon_id "ENSE00001491993"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)";
+20	ensembl_havana	CDS	87710	87767	.	+	0	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; protein_id "ENSP00000371847"; protein_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)";
+20	ensembl_havana	start_codon	87710	87712	.	+	0	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)";
+20	ensembl_havana	exon	96005	97094	.	+	.	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "2"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; exon_id "ENSE00001491984"; exon_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)";
+20	ensembl_havana	CDS	96005	96414	.	+	2	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "2"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; protein_id "ENSP00000371847"; protein_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)";
+20	ensembl_havana	stop_codon	96415	96417	.	+	0	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "2"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)";
+20	ensembl_havana	five_prime_utr	87672	87709	.	+	.	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)";
+20	ensembl_havana	three_prime_utr	96418	97094	.	+	.	gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)";
+20	ensembl_havana	gene	142590	145751	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
+20	ensembl_havana	transcript	142590	145751	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	exon	142590	142686	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; exon_id "ENSE00001491976"; exon_version "4"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	CDS	142629	142686	.	+	0	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; protein_id "ENSP00000371835"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	start_codon	142629	142631	.	+	0	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	exon	145415	145751	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "2"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; exon_id "ENSE00000858522"; exon_version "4"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	CDS	145415	145689	.	+	2	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "2"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; protein_id "ENSP00000371835"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	stop_codon	145690	145692	.	+	0	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "2"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	five_prime_utr	142590	142628	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	three_prime_utr	145693	145751	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	havana	transcript	142634	145749	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000542572"; transcript_version "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; tag "mRNA_start_NF"; transcript_support_level "3";
+20	havana	exon	142634	142686	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000542572"; transcript_version "1"; exon_number "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002285856"; exon_version "1"; tag "mRNA_start_NF"; transcript_support_level "3";
+20	havana	exon	145415	145488	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000542572"; transcript_version "1"; exon_number "2"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002303512"; exon_version "1"; tag "mRNA_start_NF"; transcript_support_level "3";
+20	havana	exon	145579	145749	.	+	.	gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000542572"; transcript_version "1"; exon_number "3"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002217818"; exon_version "1"; tag "mRNA_start_NF"; transcript_support_level "3";
+20	ensembl_havana	gene	157454	159163	.	+	.	gene_id "ENSG00000088782"; gene_version "5"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
+20	ensembl_havana	transcript	157454	159163	.	+	.	gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	exon	157454	157593	.	+	.	gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "1"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; exon_id "ENSE00001491947"; exon_version "4"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	CDS	157545	157593	.	+	0	gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "1"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; protein_id "ENSP00000371825"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	start_codon	157545	157547	.	+	0	gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "1"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	exon	158774	159163	.	+	.	gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "2"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; exon_id "ENSE00001166560"; exon_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	CDS	158774	159021	.	+	2	gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "2"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; protein_id "ENSP00000371825"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	stop_codon	159022	159024	.	+	0	gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "2"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	five_prime_utr	157454	157544	.	+	.	gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+20	ensembl_havana	three_prime_utr	159025	159163	.	+	.	gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
+21	havana	gene	297570	300321	.	+	.	gene_id "ENSG00000247315"; gene_version "4"; gene_source "havana"; gene_biotype "protein_coding";
+21	havana	transcript	297570	300321	.	+	.	gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
+21	havana	exon	297570	300321	.	+	.	gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; exon_number "1"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; exon_id "ENSE00001977652"; exon_version "4"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
+21	havana	CDS	297587	298795	.	+	0	gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; exon_number "1"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; protein_id "ENSP00000484056"; protein_version "1"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
+21	havana	start_codon	297587	297589	.	+	0	gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; exon_number "1"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
+21	havana	stop_codon	298796	298798	.	+	0	gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; exon_number "1"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
+21	havana	five_prime_utr	297570	297586	.	+	.	gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
+21	havana	three_prime_utr	298799	300321	.	+	.	gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
--- a/src/eset/create_pdata/config.vsh.yaml
+++ b/src/eset/create_pdata/config.vsh.yaml
@@ -0,0 +1,55 @@
+name: create_pdata
+namespace: eset
+description: |
+  Create a pData file by combining the STAR mapping statistics with the per-chromosome read statistics of each well barcode.
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ maintainer ]
+  - __merge__: /src/base/authors/marijke_van_moerbeke.yaml
+    roles: [ contributor ]
+arguments:
+- name: "--star_stats_file"
+  type: file
+  description: |
+    Tab-delimited text file containing statistics (per column) that were generated
+    from the STAR log files (Log.final.out, Summary.csv, ReadsPerGene.out.tab).
+    Each entry (row) in the file describes the values for one well (barcode).
+  required: true
+- name: "--nrReadsNrGenesPerChromPool"
+  type: file
+  description: |
+    Pivot table in tsv format of the combined nrReadsNrGenesPerChrom files from STAR. 
+    Describes per chromosome (as columns) the number of reads, as well as the total number 
+    of reads per cell barcode and the percentage of nuclear, ERCC and mitochondrial
+    reads.
+  required: true
+- name: "--output"
+  type: file
+  direction: output
+  default: pData.$id.txt
+resources:
+- type: python_script
+  path: create_pdata.py
+test_resources:
+- type: python_script
+  path: test.py
+- path: nrReadsNrGenesPerChromPool.txt
+- path: starLogs.txt
+
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: apt
+        packages:
+          - procps
+      - type: python
+        packages:
+          - pandas
+    test_setup:
+      - type: python
+        packages:
+          - viashpy
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/eset/create_pdata/create_pdata.py
+++ b/src/eset/create_pdata/create_pdata.py
@@ -0,0 +1,60 @@
+from itertools import batched
+import pandas as pd
+import logging
+
+### VIASH START
+meta = {
+    "name": "create_pdata",
+}
+
+par = {
+  "star_stats_file": "src/eset/create_pdata/starLogs.txt",
+  "nrReadsNrGenesPerChromPool": "src/eset/create_pdata/nrReadsNrGenesPerChromPool.txt",
+  "output": "pData.tsv"
+}
+
+### VIASH END
+
+logger = logging.getLogger()
+console_handler = logging.StreamHandler()
+logger.addHandler(console_handler)
+logger.setLevel(logging.DEBUG)
+
+def main(par):
+  logger.info(f"{meta['name']} started.")
+  parameters_str = [f'\t{param}: {param_val}\n' for param, param_val in par.items()]
+  logger.info("Parameters:\n%s", "".join(parameters_str).rstrip())
+  logger.info("Reading %s", par["star_stats_file"])
+  star_log_stats = pd.read_csv(par["star_stats_file"], sep="\t", index_col=0)
+  logger.info("STAR log statics file contains information for the following barcodes: %s", 
+              ", ".join(star_log_stats.index))
+  logger.info("Reading %s", par["nrReadsNrGenesPerChromPool"])
+  reads_and_genes_per_chr_stats = pd.read_csv(par["nrReadsNrGenesPerChromPool"], sep="\t", index_col=0)
+  logger.info("Reads per gene and chromosome table contains information for the following barcodes: %s",
+              ", ".join(reads_and_genes_per_chr_stats.index))
+  logger.info("Filtering mapping statistics file columns.")
+  cols_to_keep = ("WellID", "NumberOfMTReads", "pctMT", "NumberOfERCCReads",
+                  "pctERCC", "NumberOfChromReads", "pctChrom")
+  try:
+    reads_and_genes_per_chr_stats = reads_and_genes_per_chr_stats.loc[:,cols_to_keep]
+  except KeyError as e:
+    raise KeyError("When trying to subset the reads per genes and chromosomes file, "
+                   "a column was missing. Available columns in the file: "
+                   f"{', '.join(reads_and_genes_per_chr_stats.columns)}.") from e
+  combined_stats = pd.concat([reads_and_genes_per_chr_stats, star_log_stats], axis=1)
+  if combined_stats.isna().any(axis=None): # For non-overlapping indices, the values get filled with NA
+    raise ValueError("Error while combining two log files. It seems that the entries (barcodes) "
+                     f"do not fully overlap. Barcodes in '{par['star_stats_file']}: "
+                     f"{', '.join(reads_and_genes_per_chr_stats.index)}. Barcodes in "
+                     f"'{par['nrReadsNrGenesPerChromPool']}': "
+                     f"{', '.join(star_log_stats.index)}")
+  logger.info("Summary of final output:\n%s\n",
+                "\n".join(repr(combined_stats.loc[:,columns].describe())
+                          for columns in batched(combined_stats.columns, 3))) 
+  logger.info("Writing to %s", par["output"])
+  combined_stats.reset_index("WellBC").to_csv(par["output"], sep="\t", header=True, index=False)
+  logger.info("Finished %s.", meta["name"])
+
+
+if __name__ == "__main__":
+  main(par)
--- a/src/eset/create_pdata/nrReadsNrGenesPerChromPool.txt
+++ b/src/eset/create_pdata/nrReadsNrGenesPerChromPool.txt
@@ -0,0 +1,8 @@
+WellBC	WellID	20	pctChrom	pctMT	pctERCC	SumReads	NumberOfGenes	NumberOfERCCReads	NumberOfChromReads	NumberOfMTReads
+AACAAGGTAC	A1	8542	100	0	0	8542	408	0	8542	0
+ACGCCTTCGT	A2	5863	100	0	0	5863	377	0	5863	0
+CCATACTGAC	A3	7396	100	0	0	7396	391	0	7396	0
+GCAAGCGAAT	B1	10092	100	0	0	10092	420	0	10092	0
+GTCTCGAGTG	C5	470	100	0	0	470	150	0	470	0
+TGCGCTCATT	D6	7650	100	0	0	7650	407	0	7650	0
+TTGTGTTCGA	E19	9422	100	0	0	9422	420	0	9422	0
--- a/src/eset/create_pdata/starLogs.txt
+++ b/src/eset/create_pdata/starLogs.txt
@@ -0,0 +1,8 @@
+WellBC	NumberOfInputReads	NumberOfMappedReads	PctMappedReads	NumberOfReadsMappedToMultipleLoci	PectOfReadsMappedToMultipleLoci	NumberOfReadsMappedToTooManyLoci	PectOfReadsMappedToTooManyLoci	NumberOfReadsUnmappedTooManyMismatches	PectOfReadsUnmappedTooManyMismatches	NumberOfReadsUnmappedTooShort	PectOfReadsUnmappedTooShort	NumberOfReadsUnmappedOther	PectOfReadsUnmappedOther	ReadsWithValidBarcodes	SequencingSaturation	Q30BasesInCB+UMI	ReadsMappedToTranscriptome:Unique+MultipeGenes	EstimatedNumberOfCells	FractionOfReadsInCells	MeanReadsPerCell	NumberOfUMIs	NumberOfGenes	NumberOfCountedReads
+ACGCCTTCGT	96430	16869	17.49	0	0	6124	6.35	0	0	73375	76.09	62	0.06	0.999782	0.0665302	0.980077	0.0620969	1	1	5862	5472	377	6463
+GTCTCGAGTG	10158	1902	18.72	0	0	967	9.52	0	0	7280	71.67	9	0.09	0.999803	0.0553191	0.984451	0.0476472	1	1	470	444	150	533
+GCAAGCGAAT	156134	24005	15.37	0	0	7961	5.1	0	0	124096	79.48	72	0.05	0.999744	0.0680872	0.982779	0.0658665	1	1	10090	9403	420	11273
+CCATACTGAC	113577	17319	15.25	0	0	5905	5.2	0	0	90292	79.5	61	0.05	0.999859	0.0717282	0.982313	0.066554	1	1	7389	6859	391	8299
+TGCGCTCATT	126989	19272	15.18	0	0	7141	5.62	0	0	100515	79.15	61	0.05	0.999843	0.0667974	0.986581	0.0616668	1	1	7650	7139	407	8444
+TTGTGTTCGA	142560	22129	15.52	0	0	7045	4.94	0	0	113324	79.49	62	0.04	0.999783	0.060828	0.986622	0.0676838	1	1	9420	8847	420	10383
+AACAAGGTAC	141303	23749	16.81	0	0	8458	5.99	0	0	109035	77.16	61	0.04	0.999816	0.0698056	0.979965	0.0618175	1	1	8538	7942	408	9535
--- a/src/eset/create_pdata/test.py
+++ b/src/eset/create_pdata/test.py
@@ -0,0 +1,99 @@
+import pytest
+import sys
+import pandas as pd
+from pathlib import Path
+from uuid import uuid4
+
+### VIASH START
+meta = {
+    "resources_dir": "./src/eset/create_pdata/",
+    "executable": "target/executable/eset/create_pdata/create_pdata",
+    "config": "src/eset/create_pdata/config.vsh.yaml"
+}
+### VIASH END
+
+@pytest.fixture
+def test_reads_and_genes_per_chr_path():
+    return Path(meta["resources_dir"]) / "nrReadsNrGenesPerChromPool.txt"
+
+
+@pytest.fixture
+def test_star_logs_summary_path():
+    return Path(meta["resources_dir"]) / "starLogs.txt"
+
+
+@pytest.fixture
+def random_path(tmp_path):
+    def wrapper(extension=None):
+        extension = "" if not extension else f".{extension}"
+        return tmp_path / f"{uuid4()}{extension}"
+    return wrapper 
+
+
+def test_create_fdata(run_component, test_reads_and_genes_per_chr_path,
+                      test_star_logs_summary_path, random_path):
+    """Run the component on the two test tables and compare the written table to the expected values."""
+    output_path = random_path("tsv")
+    run_component([
+        "--star_stats_file", test_star_logs_summary_path,
+        "--nrReadsNrGenesPerChromPool", test_reads_and_genes_per_chr_path, 
+        "--output", output_path
+    ])
+    assert output_path.is_file()
+    # Every column is read as a string so the comparison below is done on the written text.
+    result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype())
+    # Expected output: one list entry per barcode, in the order of the 'WellBC' list.
+    expected_dict = {
+        'WellBC': ['AACAAGGTAC', 'ACGCCTTCGT', 'CCATACTGAC', 'GCAAGCGAAT',
+                   'GTCTCGAGTG', 'TGCGCTCATT', 'TTGTGTTCGA'],
+        'WellID': ['A1', 'A2', 'A3', 'B1', 'C5', 'D6', 'E19'],
+        'NumberOfMTReads': ['0', '0', '0', '0', '0', '0', '0'],
+        'pctMT': ['0', '0', '0', '0', '0', '0', '0'],
+        'NumberOfERCCReads': ['0', '0', '0', '0', '0', '0', '0'],
+        'pctERCC': ['0', '0', '0', '0', '0', '0', '0'],
+        'NumberOfChromReads': ['8542', '5863', '7396', '10092', '470',
+                               '7650', '9422'],
+        'pctChrom': ['100', '100', '100', '100', '100', '100', '100'],
+        'NumberOfInputReads': ['141303', '96430', '113577', '156134', '10158',
+                               '126989', '142560'],
+        'NumberOfMappedReads': ['23749', '16869', '17319', '24005', '1902',
+                                '19272', '22129'],
+        'PctMappedReads': ['16.81', '17.49', '15.25', '15.37', '18.72',
+                           '15.18', '15.52'],
+        'NumberOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'],
+        'PectOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'],
+        'NumberOfReadsMappedToTooManyLoci': ['8458', '6124', '5905', '7961', '967',
+                                             '7141', '7045'],
+        'PectOfReadsMappedToTooManyLoci': ['5.99', '6.35', '5.2', '5.1', '9.52',
+                                           '5.62', '4.94'],
+        'NumberOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'],
+        'PectOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'],
+        'NumberOfReadsUnmappedTooShort': ['109035', '73375', '90292', '124096',
+                                          '7280', '100515', '113324'],
+        'PectOfReadsUnmappedTooShort': ['77.16', '76.09', '79.5', '79.48',
+                                        '71.67', '79.15', '79.49'],
+        'NumberOfReadsUnmappedOther': ['61', '62', '61', '72', '9', '61', '62'],
+        'PectOfReadsUnmappedOther': ['0.04', '0.06', '0.05', '0.05',
+                                     '0.09', '0.05', '0.04'],
+        'ReadsWithValidBarcodes': ['0.999816', '0.999782', '0.999859', '0.999744',
+                                   '0.999803', '0.999843', '0.999783'],
+        'SequencingSaturation': ['0.0698056', '0.0665302', '0.0717282', '0.0680872',
+                                 '0.0553191', '0.0667974', '0.060828'],
+        'Q30BasesInCB+UMI': ['0.979965', '0.980077', '0.982313', '0.982779',
+                             '0.984451', '0.986581', '0.986622'],
+        'ReadsMappedToTranscriptome:Unique+MultipeGenes': ['0.0618175', '0.0620969',
+                                                           '0.066554', '0.0658665',
+                                                           '0.0476472', '0.0616668',
+                                                           '0.0676838'],
+        'EstimatedNumberOfCells': ['1', '1', '1', '1', '1', '1', '1'],
+        'FractionOfReadsInCells': ['1', '1', '1', '1', '1', '1', '1'],
+        'MeanReadsPerCell': ['8538', '5862', '7389',
+                             '10090', '470', '7650', '9420'],
+        'NumberOfUMIs': ['7942', '5472', '6859', '9403',
+                         '444', '7139', '8847'],
+        'NumberOfGenes': ['408', '377', '391', '420', '150', '407', '420'],
+        'NumberOfCountedReads': ['9535', '6463', '8299', '11273',
+                                 '533', '8444', '10383']
+    }
+    expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype())
+    # check_like=True: the order of index and columns is not part of the comparison.
+    pd.testing.assert_frame_equal(result, expected, check_like=True)
+
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__]))
--- a/src/integration_test_components/htrnaseq/check_eset/config.vsh.yaml
+++ b/src/integration_test_components/htrnaseq/check_eset/config.vsh.yaml
@@ -0,0 +1,34 @@
+name: "check_eset"
+namespace: "integration_test_components/htrnaseq"
+description: "This component tests the ExpressionSet object as output by the main pipeline."
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ author, maintainer ]
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--eset"
+        type: file
+        required: true
+        description: Path to an ExpressionSet object.
+        example: eset.rds
+      - name: "--star_output"
+        type: file
+        required: true
+        multiple: true
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: bioconductor/bioconductor_docker:3.19
+    setup:
+      - type: r
+        cran:
+          - bit64
+        bioc:
+          - Biobase
+
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/integration_test_components/htrnaseq/check_eset/script.R
+++ b/src/integration_test_components/htrnaseq/check_eset/script.R
@@ -0,0 +1,198 @@
+library(Biobase)
+library(testthat)
+library(Matrix)
+
+sample_1_result <- readRDS(par$eset)
+expected_sample_names <- c(
+  "sample_one_AACAAGGTAC", "sample_one_AACAATCAGG", "sample_one_AACACCTAGT",
+  "sample_one_AACAGGCAAT", "sample_one_AACATGGAGA", "sample_one_AACATTACCG",
+  "sample_one_AACCAGCCAG", "sample_one_AACCAGTTGA", "sample_one_AACCGCGACT",
+  "sample_one_AACCGGAAGG", "sample_one_AACCGGCGTA", "sample_one_AACCTAGTCC",
+  "sample_one_AACCTCATAG", "sample_one_AACGTAAGCT", "sample_one_AACTCTACAC",
+  "sample_one_AACTGTGTCA", "sample_one_AAGACGGATT", "sample_one_AAGATCGGCG",
+  "sample_one_AAGATGTCCA", "sample_one_AAGCATATGG", "sample_one_AAGCGATGTT",
+  "sample_one_AAGCGTTCAG", "sample_one_AAGCTCACCT", "sample_one_AAGGCATGCG",
+  "sample_one_AAGGTCTGGA", "sample_one_AAGTTAGCGC", "sample_one_AAGTTCCTTG",
+  "sample_one_AATACCGGTA", "sample_one_AATAGCCACA", "sample_one_AATCACGCGA",
+  "sample_one_AATCCATCTG", "sample_one_AATCCGCTCC", "sample_one_AATCCTACCA",
+  "sample_one_AATCGTCCGC", "sample_one_AATGAACACG", "sample_one_AATGACCTTC",
+  "sample_one_AATGAGAGCA", "sample_one_AATGTCAGTG", "sample_one_AATTAGGCCG",
+  "sample_one_AATTGCGATG", "sample_one_ACAACAGTCG", "sample_one_ACAACCATAC",
+  "sample_one_ACAACGGAGC", "sample_one_ACAAGCGCGA", "sample_one_ACACAATCTC",
+  "sample_one_ACACAGTGAA", "sample_one_ACACCGAATT", "sample_one_ACACGCAGTA",
+  "sample_one_ACACGGTCCT", "sample_one_ACACTTGCTG", "sample_one_ACAGTGCCAA",
+  "sample_one_ACATGTGTGC", "sample_one_ACCAGGACCA", "sample_one_ACCATAACAC",
+  "sample_one_ACCGAACCGT", "sample_one_ACCGAGAGTC", "sample_one_ACCGGTACAG",
+  "sample_one_ACCGTACTTC", "sample_one_ACCTCCGACA", "sample_one_ACCTCTCTCC",
+  "sample_one_ACCTGTCCGA", "sample_one_ACCTTATGTG", "sample_one_ACGAATGACA",
+  "sample_one_ACGCCTCAAC", "sample_one_ACGCCTTCGT", "sample_one_ACGCTGGATA",
+  "sample_one_ACGGTCCGTT", "sample_one_ACGTAGGCAC", "sample_one_ACGTGCTGAT",
+  "sample_one_ACTCCAAGCC", "sample_one_ACTGGCGCAT", "sample_one_ACTGGCTTCC",
+  "sample_one_ACTTAACTGC", "sample_one_ACTTCATCAC", "sample_one_ACTTCGTTGA",
+  "sample_one_ACTTCTCCTG", "sample_one_ACTTGAGGAA", "sample_one_ACTTGTAAGG",
+  "sample_one_AGAACCACGG", "sample_one_AGAAGCAATC", "sample_one_AGACCGTTAT",
+  "sample_one_AGACTAGCAT", "sample_one_AGAGATGCAG", "sample_one_AGAGCTTACA",
+  "sample_one_AGAGTGTAAC", "sample_one_AGAGTTCTGC", "sample_one_AGATAGTGCT",
+  "sample_one_AGCAATGCGC", "sample_one_AGCATGTCAT", "sample_one_AGCCACTAGC",
+  "sample_one_AGCCAGAATA", "sample_one_AGCCAGCTCT", "sample_one_AGCGATAACG",
+  "sample_one_AGCGTACAAT", "sample_one_AGCTATTCCA", "sample_one_AGCTCCTCAG",
+  "sample_one_AGGAGGCATA", "sample_one_AGGCGTCTGT", "sample_one_AGTAACTCAC",
+  "sample_one_AGTAAGCGTT", "sample_one_AGTCTGTACG", "sample_one_AGTGCAATGT",
+  "sample_one_ATAAGGTGCA", "sample_one_ATACACGACA", "sample_one_ATAGGCCATT",
+  "sample_one_ATATCCGCAT", "sample_one_ATCAGCACTT", "sample_one_ATCAGCGAGG",
+  "sample_one_ATCCAATACG", "sample_one_ATCCGCTGTG", "sample_one_ATCCGTCCAT",
+  "sample_one_ATCGACGGCT", "sample_one_ATCGCGATTA", "sample_one_ATCGGTAGGC",
+  "sample_one_ATCTAAGGAG", "sample_one_ATGACGGTAA", "sample_one_ATGACTCAGT",
+  "sample_one_ATGCACCGGA", "sample_one_ATGCGGACTG", "sample_one_ATGCTTCCTA",
+  "sample_one_ATGGACCAAC", "sample_one_ATGGTCTTAG", "sample_one_ATGGTGAGCG",
+  "sample_one_ATGTGGAAGC", "sample_one_ATTATCGGAC", "sample_one_ATTCGGAACA",
+  "sample_one_CAACAATCCA", "sample_one_CAAGAAGCAT", "sample_one_CAAGATGAGG",
+  "sample_one_CAAGCCAACG", "sample_one_CAAGTGGATC", "sample_one_CACAGTTCAT",
+  "sample_one_CACGAGTCTG", "sample_one_CACGCTCCAA", "sample_one_CACTGAGCAC",
+  "sample_one_CAGATCAATG", "sample_one_CAGTGCTCTT", "sample_one_CAGTTAAGCA",
+  "sample_one_CATAGCTATC", "sample_one_CATCACCACC", "sample_one_CATGTACGCC",
+  "sample_one_CATTACACTG", "sample_one_CATTCGACGA", "sample_one_CCAACTATGG",
+  "sample_one_CCAAGGAGTT", "sample_one_CCAATTGTTC", "sample_one_CCACAAGTGC",
+  "sample_one_CCAGCTTAGT", "sample_one_CCATAACTTG", "sample_one_CCATACTGAC",
+  "sample_one_CCATAGATCA", "sample_one_CCATGTGCTT", "sample_one_CCATTCAGCG",
+  "sample_one_CCGAACAAGC", "sample_one_CCGAACCTAA", "sample_one_CCGAAGACCT",
+  "sample_one_CCGAATAGTG", "sample_one_CCGACTTCTC", "sample_one_CCGATCCACT",
+  "sample_one_CCGATGATAC", "sample_one_CCGCGTTATG", "sample_one_CCGCTAGCTT",
+  "sample_one_CCGGAGTATC", "sample_one_CCGGCCAATT", "sample_one_CCGGTCTCTA",
+  "sample_one_CCGTACGATG", "sample_one_CCGTCAGAAC", "sample_one_CCTAGACACG",
+  "sample_one_CCTAGTTGAG", "sample_one_CCTATTCTGT", "sample_one_CCTCAACCGA",
+  "sample_one_CCTCCATAAG", "sample_one_CCTGATGCCA", "sample_one_CCTGCAATAC",
+  "sample_one_CCTTGTATTC", "sample_one_CGAGATCTCT", "sample_one_CGAGGAACAA",
+  "sample_one_CGATAACCGC", "sample_one_CGATCCTGTG", "sample_one_CGCCAACCAT",
+  "sample_one_CGCCAGTGTT", "sample_one_CGCCTTGTAC", "sample_one_CGCGGATTCA",
+  "sample_one_CGCTTAAGGC", "sample_one_CGCTTACTAA", "sample_one_CGCTTCTTGG",
+  "sample_one_CGGAAGCTGT", "sample_one_CGGAATACAC", "sample_one_CGGAGATTGG",
+  "sample_one_CGGAGCTCAA", "sample_one_CGGATCGGTA", "sample_one_CGGATTCTAG",
+  "sample_one_CGGCAACTTA", "sample_one_CGGCTCATCA", "sample_one_CGGTCGTATT",
+  "sample_one_CGGTGACATC", "sample_one_CGTAACGGAT", "sample_one_CGTAAGATTC",
+  "sample_one_CGTACTGTAA", "sample_one_CGTAGAAGAC", "sample_one_CGTCCTAGGA",
+  "sample_one_CGTCGGCAAT", "sample_one_CGTGAGTTAT", "sample_one_CGTGTCAAGC",
+  "sample_one_CTAACTTCAG", "sample_one_CTAATAGCGT", "sample_one_CTACACCAGG",
+  "sample_one_CTAGCACAAT", "sample_one_CTATGAACGG", "sample_one_CTCAAGGACC",
+  "sample_one_CTCACCTGTC", "sample_one_CTCCTATTGT", "sample_one_CTCGCAACGT",
+  "sample_one_CTCGTGCCTA", "sample_one_CTGGATTGAC", "sample_one_CTGTAGTCAG",
+  "sample_one_CTGTCGCTTC", "sample_one_CTGTCTGTGT", "sample_one_CTTCATATCG",
+  "sample_one_CTTGCTGACG", "sample_one_GAAGGATTAG", "sample_one_GAATCGAGCC",
+  "sample_one_GACCATCTAA", "sample_one_GACGACCACA", "sample_one_GAGACATCTT",
+  "sample_one_GAGCGAGTCA", "sample_one_GAGTAGACCA", "sample_one_GATACGCTTA",
+  "sample_one_GATAGACTGT", "sample_one_GATAGAGGCG", "sample_one_GATAGGTCAA",
+  "sample_one_GATATCAGGA", "sample_one_GATCTCATTC", "sample_one_GATCTGGTCG",
+  "sample_one_GATGAGTGAC", "sample_one_GATGGATACA", "sample_one_GATGTGACAG",
+  "sample_one_GATTAAGTCC", "sample_one_GATTGCACGC", "sample_one_GCAAGCGAAT",
+  "sample_one_GCAATGTAAG", "sample_one_GCACACTATA", "sample_one_GCACTCGGAA",
+  "sample_one_GCACTGCGTT", "sample_one_GCACTTAATC", "sample_one_GCAGGAGATG",
+  "sample_one_GCAGTACTGG", "sample_one_GCATATGAGT", "sample_one_GCATCCGATC",
+  "sample_one_GCCAAGTACA", "sample_one_GCCACGATTC", "sample_one_GCCATAGGTT",
+  "sample_one_GCCATATCGA", "sample_one_GCCGTCAATA", "sample_one_GCCTGGACAT",
+  "sample_one_GCGTAATTAC", "sample_one_GCTATTATCC", "sample_one_GCTCAGTAAT",
+  "sample_one_GCTGCTTATA", "sample_one_GGAATAAGCA", "sample_one_GGACGATGCT",
+  "sample_one_GGCATCGTGA", "sample_one_GGCATTATTG", "sample_one_GGCCGAGATT",
+  "sample_one_GGCGCTATAA", "sample_one_GGCGTTAAGT", "sample_one_GGCTATTGAT",
+  "sample_one_GGCTGCTACT", "sample_one_GGTAATGTGT", "sample_one_GGTGGTTGGA",
+  "sample_one_GGTGTTCACC", "sample_one_GGTTAGATCT", "sample_one_GGTTATGGCG",
+  "sample_one_GGTTCACTGG", "sample_one_GGTTGTGCAA", "sample_one_GTAACCAGTA",
+  "sample_one_GTAACCTTGG", "sample_one_GTAAGAACCT", "sample_one_GTAAGGCTCC",
+  "sample_one_GTAATCCACG", "sample_one_GTATTGTGGA", "sample_one_GTCCGCATCA",
+  "sample_one_GTCCTTCGGT", "sample_one_GTCGCTCTCT", "sample_one_GTCGGTGACA",
+  "sample_one_GTCTCGAGTG", "sample_one_GTCTCTTAAG", "sample_one_GTCTTCCGAG",
+  "sample_one_GTGACTATAC", "sample_one_GTGGTTAATG", "sample_one_GTGTGCCTGT",
+  "sample_one_GTGTGTGTCC", "sample_one_GTTCATTGCC", "sample_one_GTTCCGGTGA",
+  "sample_one_GTTCGTCGAA", "sample_one_GTTGAATTGG", "sample_one_GTTGATCCGC",
+  "sample_one_GTTGTATGCT", "sample_one_TAACCGTAGC", "sample_one_TAACGTCGAT",
+  "sample_one_TAAGGTACGG", "sample_one_TACGGACATA", "sample_one_TACTACCGCC",
+  "sample_one_TACTGTCAAG", "sample_one_TAGCGAACGC", "sample_one_TAGCGCCAAC",
+  "sample_one_TAGGACGCCT", "sample_one_TAGGTTGCAA", "sample_one_TAGTAGTCTC",
+  "sample_one_TAGTCCGCTG", "sample_one_TAGTGGAACT", "sample_one_TATCATGCAG",
+  "sample_one_TATCGTTACG", "sample_one_TCAAGTGCAG", "sample_one_TCACAGATAC",
+  "sample_one_TCACCGCCTA", "sample_one_TCACGCCACT", "sample_one_TCACGTTGGC",
+  "sample_one_TCATTGTCCA", "sample_one_TCCACACTAG", "sample_one_TCCACGGTCA",
+  "sample_one_TCCACTCGCT", "sample_one_TCCGACTAAC", "sample_one_TCCGTTATCT",
+  "sample_one_TCCTAAGAGA", "sample_one_TCCTCTAGTA", "sample_one_TCGAAGCATT",
+  "sample_one_TCGAGAGAGC", "sample_one_TCGCACTTGA", "sample_one_TCGCCTACTG",
+  "sample_one_TCGCGTAGCA", "sample_one_TCGGCGTTAA", "sample_one_TCTACATCCG",
+  "sample_one_TCTCCACATT", "sample_one_TCTCTCCTAT", "sample_one_TCTTGCTCGG",
+  "sample_one_TGAACTAACC", "sample_one_TGAAGAAGGT", "sample_one_TGAGCGTTCC",
+  "sample_one_TGAGTACGTA", "sample_one_TGGAATGGAG", "sample_one_TGTCATTCGC",
+  "sample_one_TGTGCTTCAG", "sample_one_TGTTCAGGAT", "sample_one_TTACACACGT",
+  "sample_one_TTACTGTGAC", "sample_one_TTATAGGAGG", "sample_one_TTATCGCGTT",
+  "sample_one_TTATGCCGCG", "sample_one_TTCACGGAAG", "sample_one_TTCAGGAGTA",
+  "sample_one_TTCCATCGAG", "sample_one_TTCGAGTGAT", "sample_one_TTCTGTACCT",
+  "sample_one_TTGGCAATTC", "sample_one_TTGGCTCCAC", "sample_one_TTGGTAACAG",
+  "sample_one_TTGGTCAGTA", "sample_one_TTGTCGGCCA", "sample_one_TTGTGTTCGA"
+)
+stopifnot(identical(sampleNames(sample_1_result), expected_sample_names))
+
+expected_var_labels <- c(
+  "WellBC",
+  "WellID",
+  "NumberOfMTReads",
+  "pctMT",
+  "NumberOfERCCReads",
+  "pctERCC",
+  "NumberOfChromReads",
+  "pctChrom",
+  "NumberOfInputReads",
+  "NumberOfMappedReads",
+  "PctMappedReads",
+  "NumberOfReadsMappedToMultipleLoci",
+  "PectOfReadsMappedToMultipleLoci",
+  "NumberOfReadsMappedToTooManyLoci",
+  "PectOfReadsMappedToTooManyLoci",
+  "NumberOfReadsUnmappedTooManyMismatches",
+  "PectOfReadsUnmappedTooManyMismatches",
+  "NumberOfReadsUnmappedTooShort",
+  "PectOfReadsUnmappedTooShort",
+  "NumberOfReadsUnmappedOther",
+  "PectOfReadsUnmappedOther",
+  "ReadsWithValidBarcodes",
+  "SequencingSaturation",
+  "Q30BasesInCB.UMI",
+  "ReadsMappedToTranscriptome.Unique.MultipeGenes",
+  "EstimatedNumberOfCells",
+  "FractionOfReadsInCells",
+  "MeanReadsPerCell",
+  "NumberOfUMIs",
+  "NumberOfGenes",
+  "NumberOfCountedReads",
+  "PoolName"
+)
+stopifnot(identical(varLabels(sample_1_result), expected_var_labels))
+
+# Read the raw STARsolo gene count matrix found below `mapping_dir`.
+# Rows are named after the feature IDs (with a trailing ".<digits>" suffix
+# removed) and the single column is named "sample_one_<barcode>".
+# Stops when 'barcodes.tsv' is missing or does not hold exactly one line.
+read_mm <- function(mapping_dir) {
+  raw_dir <- file.path(mapping_dir, "Solo.out", "Gene", "raw")
+  counts <- readMM(file.path(raw_dir, "matrix.mtx"))
+  feature_table <- read.table(file.path(raw_dir, "features.tsv"),
+                              sep = "\t", header = FALSE,
+                              col.names = c("ID", "Name", "Type"))
+  # Drop the version suffix from the feature IDs
+  rownames(counts) <- sub("\\.\\d+$", "", feature_table$ID)
+  barcodes_path <- file.path(raw_dir, "barcodes.tsv")
+  if (!file.exists(barcodes_path)) {
+    stop("Expected the 'Solo.out/Gene/raw' directory at ",
+         mapping_dir, " to contain a 'barcodes.tsv' file.")
+  }
+  well_barcodes <- readLines(barcodes_path)
+  n_barcodes <- length(well_barcodes)
+  if (n_barcodes != 1) {
+    stop("A single STAR Solo folder should only have ",
+         "mapped one (1) barcode, but found '",
+         n_barcodes, "'for mapping directory ", mapping_dir)
+  }
+  colnames(counts) <- paste0("sample_one_", well_barcodes)
+  counts
+}
+# Build the expected count matrix by reading every STAR output directory
+# and binding the resulting per-barcode matrices column-wise.
+expected_matrices <- lapply(par$star_output, read_mm)
+expected_matrix <- as.matrix(do.call(cbind, expected_matrices))
+# NOTE(review): `result_counts` is not used below; exprs() is called again instead.
+result_counts <- exprs(sample_1_result)
+# Every expected column (sample) name must be present in the ExpressionSet...
+stopifnot(length(setdiff(colnames(expected_matrix),
+                         colnames(exprs(sample_1_result)))) == 0)
+# ...and so must every expected row (feature) name.
+stopifnot(length(setdiff(rownames(expected_matrix),
+                         rownames(exprs(sample_1_result)))) == 0)
+# Only the columns are put in the ExpressionSet's order before the comparison.
+# NOTE(review): rows are not reordered, so this presumably relies on both
+# matrices sharing the same feature order -- confirm.
+expected_matrix_sorted <- expected_matrix[, colnames(exprs(sample_1_result))]
+stopifnot(identical(exprs(sample_1_result), expected_matrix_sorted))
--- a/src/parallel_map/STAR
+++ b/src/parallel_map/STAR
--- a/src/parallel_map/config.vsh.yaml
+++ b/src/parallel_map/config.vsh.yaml
@@ -0,0 +1,110 @@
+name: parallel_map
+description: |
+  Map wells in batch, using STAR
+  Spliced Transcripts Alignment to a Reference (C) Alexander Dobin
+  https://github.com/alexdobin/STAR
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ maintainer ]
+  - __merge__: /src/base/authors/toni_verbeiren.yaml
+    roles: [ author, maintainer ]
+requirements:
+  commands:
+    - STAR
+    - file
+    - parallel
+argument_groups:
+  - name: Input arguments
+    arguments:
+    - name: "--input_r1"
+      type: file
+      required: true
+      multiple: true
+    - name: "--input_r2"
+      type: file
+      required: true
+      multiple: true
+    - name: "--genomeDir"
+      type: file
+      required: true
+      description: STAR reference directory
+    - name: "--barcodes"
+      type: string 
+      multiple: true
+      required: true
+      description: The barcodes/wells to process
+  - name: Barcode arguments
+    arguments:
+    - name: "--wellBarcodesLength"
+      type: integer
+      required: true
+      description: The length of the well barcodes
+    - name: "--umiLength"
+      type: integer
+      required: true
+      description: The length of the UMIs
+    - name: "--limitBAMsortRAM"
+      type: string
+      default: "10000000000"
+  - name: Runtime arguments
+    arguments:
+    - name: "--runThreadN"
+      description: "Number of threads to use for a single STAR execution."
+      type: integer
+      default: 1
+  - name: Output arguments
+    arguments:
+    - name: "--output"
+      type: file
+      description: |
+        Location of the output folders, 1 folder per barcode. The value used
+        for this argument must contain a '*', which will be replaced with the
+        barcode to form the final output location for that barcode.
+      required: true
+      multiple: true
+      direction: output
+      default: './*'
+    - name: "--joblog"
+      type: file
+      description: Where to store the log file listing all the jobs.
+      required: false
+      direction: output
+      default: "execution_log.txt"
+
+resources:
+- type: bash_script
+  path: script.sh
+- path: STAR
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+# Container definition used to run this component.
+engines:
+  - type: docker
+    image: debian:stable-slim
+    setup:
+      # System packages; `parallel` and `file` are also listed under `requirements.commands`.
+      - type: apt
+        packages:
+          - procps
+          - wget
+          - automake
+          - make
+          - gcc
+          - g++
+          - zlib1g-dev
+          - parallel
+          - file
+      - type: docker
+        # STAR version, referenced by the environment variables below.
+        build_args:
+          - STAR_V=2.7.6a
+        # NOTE(review): STAR_SOURCE, STAR_TARGET and STAR_INSTALL_DIR are not referenced
+        # by any setup step visible in this config; only STAR_BINARY is used (in `copy`).
+        # Confirm whether they and the build tools installed above are still needed.
+        env:
+          - STAR_SOURCE="https://github.com/alexdobin/STAR/archive/refs/tags/$STAR_V.tar.gz"
+          - STAR_TARGET="/app/star-$STAR_V.tar.gz"
+          - STAR_INSTALL_DIR="/app/STAR-$STAR_V"
+          - STAR_BINARY=STAR
+        # Presumably copies the `STAR` file listed under `resources` into the image -- confirm.
+        copy:
+          - STAR /usr/local/bin/$STAR_BINARY
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/parallel_map/script.sh
+++ b/src/parallel_map/script.sh
@@ -0,0 +1,292 @@
+#!/bin/bash
+
+## VIASH START
+par_input_r1="work/2c/5b8b3a2dd4a988b8838e3f72d38a37/_viash_par/input_r1_1/two__ACACCGAATT.concat_text_r1.output.txt"
+par_input_r2="work/2c/5b8b3a2dd4a988b8838e3f72d38a37/_viash_par/input_r2_1/two__ACACCGAATT.concat_text_r2.output.txt"
+par_barcodes="ACACCGAATT;GGCTATTGAT"
+par_output="./*"
+par_genomeDir="star"
+par_wellBarcodesLength=10
+par_umiLength=10
+par_limitBAMsortRAM="10000000000"
+meta_cpus=2
+par_runThreadN=1
+## VIASH END
+
+set -eo pipefail
+
+# Check if wildcard character is present in output folder template.
+# The template is handed to printf as an argument instead of being embedded
+# in the format string, so a '%' or '\' in the path is printed literally.
+printf "Checking if output folder template (%s) contains a single wildcard character '*'. " "$par_output"
+# Remove every character that is not a '*'; one character remains per wildcard.
+output_glob_character="${par_output//[^\*]}"
+if [[ "${#output_glob_character}" -ne "1" ]]; then
+  echo "The value for --output must contain exactly one '*' character. Exiting..."
+  exit 1
+else
+  echo "Done, wildcard character found!"
+fi
+
+# Split the delimited strings into arrays
+IFS=';' read -r -a barcodes <<< "$par_barcodes"
+IFS=';' read -r -a input_r1 <<< "$par_input_r1"
+IFS=';' read -r -a input_r2 <<< "$par_input_r2"
+
+# Check that the number of values provided for the barcodes and the fastq files are the same.
+num_barcodes="${#barcodes[@]}"
+num_r1_inputs="${#input_r1[@]}"
+num_r2_inputs="${#input_r2[@]}"
+
+if [ ! "$num_barcodes" -eq "$num_r1_inputs" ] || [ ! "$num_r1_inputs" -eq "$num_r2_inputs" ]; then
+  echo "The number of values for arguments 'barcodes' ($num_barcodes), "\
+        "'input_r1' ($num_r1_inputs) and 'input_r2' ($num_r2_inputs) "\
+        "should be the same, and their order should match."
+  exit 1
+else
+  echo "Checked if length of barcodes input ($num_barcodes) is "\
+       "the same as R1 reads ($num_r1_inputs) and R2 reads "\
+       "($num_r2_inputs). Seems OK!"
+fi
+
+# Function to test for unique values in array
+function arrayContainsUniqueValues {
+  # Pass the argument by reference
+  local -n arr=$1
+  # Create a temporary associative array
+  # in order to use its uniqueness of keys
+  # 'declare' in a function is automatically local
+  declare -A uniq_tmp
+  for item in "${arr[@]}"; do
+    uniq_tmp[$item]=0 # assigning a placeholder
+  done
+  local unique_array_values=(${!uniq_tmp[@]})
+  if [ "${#unique_array_values[@]}" -eq "${#arr[@]}" ]; then
+    return
+  fi
+  false
+}
+arrayContainsUniqueValues barcodes
+is_array_unique_exit_code=$?
+if ! (exit $is_array_unique_exit_code); then 
+  echo "The provided barcodes should be unique!"
+  echo "Values: $par_barcodes"
+  exit 1
+fi
+
+# Define the function that will be used to run a single job: map the reads
+# of one barcode with STAR and verify that all reads were processed.
+# Arguments:
+#   $1  length of the well barcode
+#   $2  length of the UMI
+#   $3  output folder template; the '*' is replaced by the barcode
+#   $4  directory of the STAR genome index
+#   $5  value passed to STAR's --limitBAMsortRAM
+#   $6  value passed to STAR's --runThreadN
+#   $7  barcode handled by this job
+#   $8  file with the R1 reads (barcode and UMI), plain or gzipped
+#   $9  file with the R2 reads, plain or gzipped
+# Returns:
+#   Non-zero when the mates differ in length or when the number of reads
+#   reported by STAR differs from the number of input reads.
+function _run() {
+  local par_wellBarcodeLength="$1"
+  local par_UMIlength="$2"
+  local par_output="$3"
+  local par_genomeDir="$4"
+  local par_limitBAMsortRAM="$5"
+  local par_runThreadN="$6"
+  local barcode="$7"
+  local input_R1="$8"
+  local input_R2="$9"
+  # The UMI directly follows the well barcode (positions are 1-based).
+  local par_UMIstart=$((par_wellBarcodeLength + 1))
+
+  # The function is executed by GNU parallel, so the shell options of the
+  # main script are set again here.
+  set -eo pipefail
+
+  echo "Processing $barcode"
+  echo "For the following inputs (lanes):"
+  echo "  $input_R1"
+  echo "  $input_R2"
+
+  echo "Writing barcode '$barcode' to $barcode.txt and using it as input".
+  # Note that there is no possible conflict between jobs here
+  # because the barcodes are unique (and the barcode is part of the name
+  # of the file).
+  echo "$barcode" > "$barcode.txt"
+
+  local dir="${par_output//\*/$barcode}/"
+  echo "Setting output for barcode '$barcode' to '$dir'."
+  mkdir -p "$dir"
+
+  # Scratch directory for this job. 'meta_temp_dir' is only available here
+  # when the calling environment exported it, so fall back to TMPDIR and
+  # finally to /tmp. The base is resolved before TMPDIR is made local.
+  local tmp_base="${meta_temp_dir:-${TMPDIR:-/tmp}}"
+  # Declaration and assignment are separate statements: 'local x=$(cmd)'
+  # would hide a failure of mktemp behind the exit status of 'local'.
+  local TMPDIR
+  TMPDIR=$(mktemp -d "$tmp_base/parallel_map-$barcode-XXXXXX")
+  function clean_up {
+    [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+  }
+  trap clean_up RETURN
+
+  # Decompress the input files when needed
+  # NOTE: for some reason, using STAR's --readFilesCommand does not always work
+  # This might be because STAR creates fifo files (see https://man7.org/linux/man-pages/man7/fifo.7.html)
+  # and this requires a filesystem that supports this. Another cause might be that the input files
+  # are symlinks. When testing this, using '--readFilesCommand "zcat"'
+  # always produced empty BAM files, but also a successful exit code (0) so the problem is not reported.
+  # However, the logs showed the following error: "gzip -: unexpected end of file".
+
+  # Succeeds when file $1 is gzip compressed.
+  function is_gzipped {
+    printf "Checking if input '%s' (barcode '%s') is gzipped... " "$1" "$barcode"
+    # --brief leaves the file name out of the output, so a path that happens
+    # to contain 'gzip' is not mistaken for a compressed file.
+    if file --brief "$1" | grep -q 'gzip'; then
+      echo "Done, detected compressed file."
+      return
+    fi
+    echo "Done, file does not need decompression."
+    false
+  }
+
+  # Resolve symbolic links to actual file paths
+  input_R1=$(realpath "$input_R1")
+  input_R2=$(realpath "$input_R2")
+
+  if is_gzipped "$input_R1"; then
+    local compressed_file_name_r1
+    compressed_file_name_r1="$(basename -- "$input_R1")"
+    # The 'R1_' prefix keeps the mates apart when both files have the same name.
+    local uncompressed_file_r1="$TMPDIR/R1_${compressed_file_name_r1%.gz}"
+    printf "Unpacking input to %s... " "$uncompressed_file_r1"
+    zcat "$input_R1" > "$uncompressed_file_r1"
+    echo "Decompression done."
+  else
+    local uncompressed_file_r1="$input_R1"
+  fi
+
+  if is_gzipped "$input_R2"; then
+    local compressed_file_name_r2
+    compressed_file_name_r2="$(basename -- "$input_R2")"
+    local uncompressed_file_r2="$TMPDIR/R2_${compressed_file_name_r2%.gz}"
+    printf "Unpacking input to %s... " "$uncompressed_file_r2"
+    zcat "$input_R2" > "$uncompressed_file_r2"
+    echo "Decompression done."
+  else
+    local uncompressed_file_r2="$input_R2"
+  fi
+
+  local n_input_lines_r1 n_input_lines_r2
+  n_input_lines_r1=$(wc -l < "$uncompressed_file_r1")
+  n_input_lines_r2=$(wc -l < "$uncompressed_file_r2")
+
+  printf "Checking if length of input file mates match. "
+  if (( n_input_lines_r1 != n_input_lines_r2 )); then
+    echo "The length of file $input_R1 ($n_input_lines_r1) does not match with $input_R2 ($n_input_lines_r2)"
+    return 1
+  else
+    echo "Seems OK, $n_input_lines_r1 input lines."
+  fi
+  echo "Starting STAR for barcode '$barcode'"
+  # soloType 'Droplet' is the same as 'CB_UMI_Simple': one UMI and one cell barcode of fixed length.
+  # By default in this mode, STAR will look for the cell barcode and the UMI in the last file specified with --readFilesIn
+  # So we need to specify R2 first and R1 second, because R1 contains the barcode and UMI.
+  # Also, you might be tempted to use '--soloBarcodeMate 1' to alter this behavior, but this requires clipping
+  # the barcode from this mate by specifying --clip5pNbases and/or --clip3pNbases, which we do not want to do.
+  STAR \
+    --readFilesIn "$uncompressed_file_r2" "$uncompressed_file_r1" \
+    --soloType Droplet \
+    --quantMode GeneCounts \
+    --genomeLoad LoadAndKeep \
+    --limitBAMsortRAM "$par_limitBAMsortRAM" \
+    --runThreadN "$par_runThreadN" \
+    --outFilterMultimapNmax 1 \
+    --outSAMtype BAM SortedByCoordinate \
+    --soloCBstart 1 \
+    --readFilesType "Fastx" \
+    --soloCBlen "$par_wellBarcodeLength" \
+    --soloUMIstart "$par_UMIstart" \
+    --soloUMIlen "$par_UMIlength" \
+    --soloBarcodeReadLength 0 \
+    --soloStrand Unstranded \
+    --soloFeatures Gene \
+    --genomeDir "$par_genomeDir" \
+    --outReadsUnmapped Fastx \
+    --outSAMunmapped Within \
+    --outSAMattributes NH HI nM AS CR UR CB UB GX GN \
+    --soloCBwhitelist "$barcode.txt" \
+    --outFileNamePrefix "$dir" \
+    --outTmpDir "$TMPDIR/STARtemp/"
+
+  printf "Done running STAR. "
+  # Check if the number of processed reads is equal to the number of input reads
+  # (a FASTQ record consists of 4 lines).
+  local n_input_reads=$((n_input_lines_r1 / 4))
+  local nr_output_reads
+  # Report a missing read count explicitly instead of failing later on with
+  # an arithmetic syntax error caused by an empty value.
+  nr_output_reads=$(grep -Po "Number\ of\ input\ reads \\|\W*\K\d+" "$dir/Log.final.out") || {
+    echo "Could not find the number of input reads in '$dir/Log.final.out'."
+    return 1
+  }
+  if (( nr_output_reads != n_input_reads )); then
+    echo "Not all input reads were processed for barcode $barcode."
+    return 1
+  else
+    echo "Processed $nr_output_reads reads for barcode $barcode".
+  fi
+
+  printf "Making sure that the output has the proper permissions."
+  find "$dir" -type d -exec chmod o+x {} \;
+  chmod -R o+r "$dir"
+  echo "Done"
+}
+
+# Export the function - requires bash.
+# This makes _run available in the shells that GNU parallel starts for the jobs.
+export -f _run
+
+# Load reference genome once, before any job starts
+# (the jobs themselves run STAR with '--genomeLoad LoadAndKeep').
+echo "Loading reference genome"
+STAR --genomeLoad LoadAndExit --genomeDir "$par_genomeDir"
+
+# Run the concurrent jobs using GNU parallel
+
+# Make sure that parallel uses the correct shell
+export PARALLEL_SHELL="/bin/bash"
+
+# Some notes:
+#   --halt soon,fail=1: once a job has failed, parallel starts no new jobs;
+#                       jobs that are already running are allowed to finish.
+#   --retries 4:        a failing job is tried up to 4 times.
+#   NOTE(review): --retry-failed is documented by GNU parallel as re-running the
+#   failed jobs listed in an existing joblog; confirm that it is intended here.
+#
+# ::: is a special syntax for GNU parallel to delineate inputs
+# If multiple ::: are given, each group will be treated as an input source, and all combinations of input
+# sources will be generated. E.g. ::: 1 2 ::: a b c will result in the combinations (1,a) (1,b) (1,c) (2,a) (2,b) (2,c)
+# The delimiter :::+ (note the extra '+') links the argument to the previous argument, and one argument from each of the input
+# sources will be read.
+# The command is built as an array so that every argument stays a single word.
+parallel_cmd=("parallel" "--jobs" "80%" "--verbose" "--memfree" "2G"
+              "--tmpdir" "$meta_temp_dir"
+              "--retry-failed" "--retries" "4" "--halt" "soon,fail=1"
+              "--joblog" "$par_joblog" "_run" "{}")
+
+# Arguments for which there is one value, so these will not create extra jobs.
+# Their order has to match the positional parameters $1..$6 of _run.
+parallel_cmd+=(":::" "$par_wellBarcodesLength" ":::" "$par_umiLength" ":::" "$par_output" ":::" "$par_genomeDir" ":::" "$par_limitBAMsortRAM" ":::" "$par_runThreadN")
+
+# Argument which in fact will cause extra jobs to be spawned, per job one item from each argument will be selected
+# Thus, these argument lists should have the same length.
+# They end up as positional parameters $7 (barcode), $8 (R1) and $9 (R2) of _run.
+parallel_cmd+=(":::" "${barcodes[@]}" ":::+" "${input_r1[@]}" ":::+" "${input_r2[@]}")
+
+# Switch off errexit while parallel runs: its exit code is stored for the
+# report at the end of the script, and the genome is unloaded first.
+set +eo pipefail
+"${parallel_cmd[@]}"
+exit_code=$?
+set -eo pipefail
+
+echo "GNU parallel finished!"
+
+# Unload reference (also done when one of the jobs failed)
+printf "Unloading reference genome. "
+STAR --genomeLoad Remove --genomeDir "$par_genomeDir"
+echo "Done!"
+
+# Exit code from GNU parallel:
+# If fail=1 is used, the exit status will be the exit status of the failing job.
+echo "Checking exit code"
+report_separator="=================================================================="
+if ((exit_code>0)); then
+  # A missing or unreadable job log must not get in the way of the error report.
+  joblog_contents="(the job log '$par_joblog' could not be read)"
+  if [[ -r "$par_joblog" ]]; then
+    joblog_contents="$(<"$par_joblog")"
+  fi
+  # All variable content is passed to printf as arguments, never as part of
+  # the format string, so '%' or '\' characters in the log are printed as-is.
+  printf '%s\n\n!!! An error occurred for one of the jobs.\nExit code of the failing job: %s.\n\n%s\n\n%s\n\n' \
+    "$report_separator" "$exit_code" "$joblog_contents" "$report_separator"
+  exit 1
+else
+  printf "%s\n\nMapping went fine (exit code '%s'), zero errors occurred\n\n%s\n" \
+    "$report_separator" "$exit_code" "$report_separator"
+fi
+
+
+
+
+
+
--- a/src/parallel_map/test.sh
+++ b/src/parallel_map/test.sh
@@ -0,0 +1,356 @@
+set -eo pipefail
+
+## VIASH START
+meta_executable=$(realpath "target/executable/parallel_map/parallel_map")
+## VIASH END
+
+# Some helper functions
+
+# Exit with status 1 and a message when $1 is not an existing directory.
+assert_directory_exists() {
+  [ -d "$1" ] || { echo "Directory '$1' does not exist" && exit 1; }
+}
+
+assert_file_exists() {
+  [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; }
+}
+
+assert_file_contains() {
+  grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
+}
+
+assert_file_contains_regex() {
+  grep -q -E "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
+}
+
+echo "> Prepare test data in $meta_temp_dir"
+TMPDIR=$(mktemp -d --tmpdir="$meta_temp_dir")
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+trap clean_up EXIT
+
+# Sample 1, barcode ACAGTCACAG, UMI CTACGGATGA
+cat > "$TMPDIR/sample1_R1.fastq" <<'EOF'
+@SAMPLE_1_SEQ_ID1
+ACAGTCACAGCTACGGATGAGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAG
+
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@SAMPLE_1_SEQ_ID2
+ACAGTCACAGCTACGGATGAGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAG
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+EOF
+
+cat > "$TMPDIR/sample1_R2.fastq" <<'EOF'
+@SAMPLE_1_SEQ_ID1
+CTCACAGAGAGTCACAACATAGGCGCGGATGTGTGAGGCTTATGAGGC
+
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@SAMPLE_1_SEQ_ID2
+CTCACAGAGAGTCACAACATAGGCGCGGATGTGTGAGGCTTATGAGGC
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+EOF
+
+# Sample 2, barcode CGGGTTTACC, UMI GCTAGCTAGC
+cat > "$TMPDIR/sample2_R1.fastq" << 'EOF'
+@SAMPLE_2_SEQ_ID1
+CGGGTTTACCGCTAGCTAGCCACCACTATGGTTGGCCGGTTAGTAGTGT
+
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@SAMPLE_2_SEQ_ID2
+CGGGTTTACCGCTAGCTAGCCACCACTATGGTTGGCCGGTTAGTAGTGT
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+EOF
+
+cat > "$TMPDIR/sample2_R2.fastq" <<'EOF'
+@SAMPLE_2_SEQ_ID1
+ACACTACTAACCGGCCAACCATAGTGGTG
+
+IIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@SAMPLE_2_SEQ_ID2
+ACACTACTAACCGGCCAACCATAGTGGTG
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+EOF
+
+# Reference genome: a single, short sequence named '1'.
+# Note that there is a sjdbGTFchrPrefix argument for STAR:
+# prefix for chromosome names in a GTF file (default: '-')
+cat > "$TMPDIR/genome.fasta" <<'EOF'
+>1
+TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG
+GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA
+TGCGAATGGCACTTCACGACGGACTGTCCTTAGCTCAGGGGA
+EOF
+
+# Annotation with two genes (gene1 and gene2), each with a single exon.
+# NOTE(review): the gene_name attributes are written with a ':' and the 'gene'
+# records miss a closing quote. The features.tsv files expected further down
+# list the gene_id in both columns, so changing these records probably
+# requires updating those expectations as well - verify before editing.
+cat > "$TMPDIR/genes.gtf" <<'EOF'
+1    example_source  gene       0    72   .   +   .   gene_id "gene1"; gene_name: "GENE1;
+1    example_source  exon       20   71   .   +   .   gene_id "gene1"; gene_name: "GENE1"; exon_id: gene1_exon1;
+1    example_source  gene       80   160   .   +   .   gene_id "gene2"; gene_name: "GENE2;
+1    example_source  exon       80   159   .   +   .   gene_id "gene2"; gene_name: "GENE2"; exon_id: gene2_exon1;
+
+EOF
+
+echo "> Generate index"
+STAR \
+  ${meta_cpus:+--runThreadN $meta_cpus} \
+  --runMode genomeGenerate \
+  --genomeDir "$TMPDIR/index/" \
+  --genomeFastaFiles "$TMPDIR/genome.fasta" \
+  --sjdbGTFfile "$TMPDIR/genes.gtf" \
+  --genomeSAindexNbases 2 > /dev/null 2>&1
+
+
+# Test 1: two samples with uncompressed input files.
+echo "> Run test 1"
+run_1_dir="$TMPDIR/run_1"
+mkdir -p "$run_1_dir"
+pushd "$run_1_dir" > /dev/null
+"$meta_executable" \
+    --input_r1 "$TMPDIR/sample1_R1.fastq;$TMPDIR/sample2_R1.fastq" \
+    --input_r2 "$TMPDIR/sample1_R2.fastq;$TMPDIR/sample2_R2.fastq" \
+    --genomeDir "$TMPDIR/index/" \
+    --barcodes "ACAGTCACAG;CGGGTTTACC" \
+    --wellBarcodesLength 10 \
+    --umiLength 10 \
+    --runThreadN 2 \
+    --output "$TMPDIR/output_*" > /dev/null 2>&1
+# Discard the directory stack that popd prints, as is done for the other runs.
+popd > /dev/null
+
+echo ">> Check if output directories exists"
+# One output directory per barcode: the '*' of --output replaced by the barcode.
+sample1_out="$TMPDIR/output_ACAGTCACAG"
+sample2_out="$TMPDIR/output_CGGGTTTACC"
+for sample in "$sample1_out" "$sample2_out"; do
+  assert_directory_exists "$sample"
+done
+
+echo ">> Check if output files have been created"
+for sample in "$sample1_out" "$sample2_out"; do
+  for star_output in \
+    Aligned.sortedByCoord.out.bam \
+    Unmapped.out.mate1 \
+    Unmapped.out.mate2 \
+    Log.out \
+    Log.final.out \
+    ReadsPerGene.out.tab
+  do
+    assert_file_exists "$sample/$star_output"
+  done
+done
+
+
+echo ">> Check if Solo output is present"
+# The loop variable is used in the body, so the output of every sample is checked.
+for sample in "$sample1_out" "$sample2_out"; do
+  assert_directory_exists "$sample/Solo.out"
+  assert_directory_exists "$sample/Solo.out/Gene"
+  assert_file_exists "$sample/Solo.out/Barcodes.stats"
+  assert_file_exists "$sample/Solo.out/Gene/raw/barcodes.tsv"
+  assert_file_exists "$sample/Solo.out/Gene/raw/features.tsv"
+  assert_file_exists "$sample/Solo.out/Gene/raw/matrix.mtx"
+  assert_file_exists "$sample/Solo.out/Gene/filtered/barcodes.tsv"
+  assert_file_exists "$sample/Solo.out/Gene/filtered/features.tsv"
+  assert_file_exists "$sample/Solo.out/Gene/filtered/matrix.mtx"
+done
+
+echo ">> Check contents of output"
+echo ">>> Sample 1"
+assert_file_contains "$sample1_out/Solo.out/Barcodes.stats" "nExactMatch              2"
+assert_file_contains "$sample1_out/Log.final.out" "Uniquely mapped reads number |	2"
+assert_file_contains "$sample1_out/Log.final.out" "Number of input reads |	2"
+
+cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; }
+ACAGTCACAG
+EOF
+
+cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; }
+gene1	gene1	Gene Expression
+gene2	gene2	Gene Expression
+EOF
+
+cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; }
+%%MatrixMarket matrix coordinate integer general
+%
+2 1 1
+1 1 1
+EOF
+
+echo ">>> Sample 2"
+assert_file_contains "$sample2_out/Solo.out/Barcodes.stats" "nExactMatch              2"
+assert_file_contains "$sample2_out/Log.final.out" "Uniquely mapped reads number |	2"
+assert_file_contains "$sample2_out/Log.final.out" "Number of input reads |	2"
+
+cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; }
+CGGGTTTACC
+EOF
+
+cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; }
+gene1	gene1	Gene Expression
+gene2	gene2	Gene Expression
+EOF
+
+cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; }
+%%MatrixMarket matrix coordinate integer general
+%
+2 1 1
+2 1 1
+EOF
+
+echo "> Run test 2 (compressed input)"
+gzip -c "$TMPDIR/sample1_R1.fastq" > "$TMPDIR/sample1_R1.fastq.gz"
+gzip -c "$TMPDIR/sample2_R1.fastq" > "$TMPDIR/sample2_R1.fastq.gz"
+gzip -c "$TMPDIR/sample1_R2.fastq" > "$TMPDIR/sample1_R2.fastq.gz"
+gzip -c "$TMPDIR/sample2_R2.fastq" > "$TMPDIR/sample2_R2.fastq.gz"
+
+run_2_dir="$TMPDIR/run_2"
+mkdir -p "$run_2_dir" 
+pushd "$run_2_dir" > /dev/null
+"$meta_executable" \
+    --input_r1 "$TMPDIR/sample1_R1.fastq.gz;$TMPDIR/sample2_R1.fastq.gz" \
+    --input_r2 "$TMPDIR/sample1_R2.fastq.gz;$TMPDIR/sample2_R2.fastq.gz" \
+    --genomeDir "$TMPDIR/index/" \
+    --barcodes "ACAGTCACAG;CGGGTTTACC" \
+    --wellBarcodesLength 10 \
+    --umiLength 10 \
+    --runThreadN 2 \
+    --output "$TMPDIR/output_gz_*" > /dev/null 2>&1
+popd > /dev/null
+
+echo ">> Check if output directories exists"
+# One output directory per barcode: the '*' of --output replaced by the barcode.
+sample1_out="$TMPDIR/output_gz_ACAGTCACAG"
+sample2_out="$TMPDIR/output_gz_CGGGTTTACC"
+for sample in "$sample1_out" "$sample2_out"; do
+  assert_directory_exists "$sample"
+done
+
+echo ">> Check if output files have been created"
+for sample in "$sample1_out" "$sample2_out"; do
+  for star_output in \
+    Aligned.sortedByCoord.out.bam \
+    Unmapped.out.mate1 \
+    Unmapped.out.mate2 \
+    Log.out \
+    Log.final.out \
+    ReadsPerGene.out.tab
+  do
+    assert_file_exists "$sample/$star_output"
+  done
+done
+
+
+echo ">> Check if Solo output is present"
+# The loop variable is used in the body, so the output of every sample is checked.
+for sample in "$sample1_out" "$sample2_out"; do
+  assert_directory_exists "$sample/Solo.out"
+  assert_directory_exists "$sample/Solo.out/Gene"
+  assert_file_exists "$sample/Solo.out/Barcodes.stats"
+  assert_file_exists "$sample/Solo.out/Gene/raw/barcodes.tsv"
+  assert_file_exists "$sample/Solo.out/Gene/raw/features.tsv"
+  assert_file_exists "$sample/Solo.out/Gene/raw/matrix.mtx"
+  assert_file_exists "$sample/Solo.out/Gene/filtered/barcodes.tsv"
+  assert_file_exists "$sample/Solo.out/Gene/filtered/features.tsv"
+  assert_file_exists "$sample/Solo.out/Gene/filtered/matrix.mtx"
+done
+
+echo ">> Check contents of output"
+echo ">>> Sample 1"
+assert_file_contains "$sample1_out/Solo.out/Barcodes.stats" "nExactMatch              2"
+assert_file_contains "$sample1_out/Log.final.out" "Uniquely mapped reads number |	2"
+assert_file_contains "$sample1_out/Log.final.out" "Number of input reads |	2"
+
+cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; }
+ACAGTCACAG
+EOF
+
+cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; }
+gene1	gene1	Gene Expression
+gene2	gene2	Gene Expression
+EOF
+
+cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; }
+%%MatrixMarket matrix coordinate integer general
+%
+2 1 1
+1 1 1
+EOF
+
+echo ">>> Sample 2"
+assert_file_contains "$sample2_out/Solo.out/Barcodes.stats" "nExactMatch              2"
+assert_file_contains "$sample2_out/Log.final.out" "Uniquely mapped reads number |	2"
+assert_file_contains "$sample2_out/Log.final.out" "Number of input reads |	2"
+
+cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; }
+CGGGTTTACC
+EOF
+
+cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; }
+gene1	gene1	Gene Expression
+gene2	gene2	Gene Expression
+EOF
+
+cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; }
+%%MatrixMarket matrix coordinate integer general
+%
+2 1 1
+2 1 1
+EOF
+
+
+echo "> Check that wrong number of barcodes are detected."
+run_3_dir="$TMPDIR/run_3"
+mkdir -p "$run_3_dir" 
+pushd "$run_3_dir" > /dev/null
+set +eo pipefail
+"$meta_executable" \
+    --input_r1 "$TMPDIR/sample1_R1.fastq.gz;$TMPDIR/sample2_R1.fastq.gz" \
+    --input_r2 "$TMPDIR/sample1_R2.fastq.gz;$TMPDIR/sample2_R2.fastq.gz" \
+    --genomeDir "$TMPDIR/index/" \
+    --barcodes "ACAGTCACAG" \
+    --wellBarcodesLength 10 \
+    --umiLength 10 \
+    --runThreadN 2 \
+    --output "$TMPDIR/output_gz_*" > /dev/null 2>&1 && echo "Expected non-zero exit code " && exit 1
+set -eo pipefail
+popd > /dev/null
+
+echo "> Check that missing wildcard character is detected."
+run_4_dir="$TMPDIR/run_4"
+mkdir -p "$run_4_dir" 
+pushd "$run_4_dir" > /dev/null
+set +eo pipefail
+"$meta_executable" \
+    --input_r1 "$TMPDIR/sample1_R1.fastq.gz;$TMPDIR/sample2_R1.fastq.gz" \
+    --input_r2 "$TMPDIR/sample1_R2.fastq.gz;$TMPDIR/sample2_R2.fastq.gz" \
+    --genomeDir "$TMPDIR/index/" \
+    --barcodes "ACAGTCACAG;CGGGTTTACC" \
+    --wellBarcodesLength 10 \
+    --umiLength 10 \
+    --runThreadN 2 \
+    --output "$TMPDIR/output_run4" > /dev/null 2>&1 && echo "Expected non-zero exit code." && exit 1 
+set -eo pipefail
+popd > /dev/null
+
+echo "> Check that a mismatch in the length of the input mates is detected."
+empty_input_file="$TMPDIR/empty.fastq"
+touch "$empty_input_file"
+run_5_dir="$TMPDIR/run_5"
+mkdir -p "$run_5_dir" 
+pushd "$run_5_dir" > /dev/null
+set +eo pipefail
+"$meta_executable" \
+    --input_r1 "$TMPDIR/sample1_R1.fastq;$empty_input_file" \
+    --input_r2 "$TMPDIR/sample1_R2.fastq;$TMPDIR/sample2_R2.fastq" \
+    --genomeDir "$TMPDIR/index/" \
+    --barcodes "ACAGTCACAG;CGGGTTTACC" \
+    --wellBarcodesLength 10 \
+    --umiLength 10 \
+    --runThreadN 2 \
+    --output "$TMPDIR/output_run5_*" > /dev/null 2>&1 && echo "Expected non-zero exit code " && exit 1
+set -eo pipefail
+popd > /dev/null
+
+echo "> Check that wrong number of input files is detected."
+run_6_dir="$TMPDIR/run_6"
+mkdir -p "$run_6_dir" 
+pushd "$run_6_dir" > /dev/null
+set +eo pipefail
+"$meta_executable" \
+    --input_r1 "$TMPDIR/sample1_R1.fastq" \
+    --input_r2 "$TMPDIR/sample1_R2.fastq;$TMPDIR/sample2_R2.fastq" \
+    --genomeDir "$TMPDIR/index/" \
+    --barcodes "ACAGTCACAG;CGGGTTTACC" \
+    --wellBarcodesLength 10 \
+    --umiLength 10 \
+    --runThreadN 2 \
+    --output "$TMPDIR/output_run_6_*" > /dev/null 2>&1 && echo "Expected non-zero exit code " && exit 1
+set -eo pipefail
+popd > /dev/null
+
+
--- a/src/report/OutputSTARsolo.png
+++ b/src/report/OutputSTARsolo.png
--- a/src/report/config.vsh.yaml
+++ b/src/report/config.vsh.yaml
@@ -0,0 +1,73 @@
+# Viash component that renders the HTML QC report.
+name: create_report
+namespace: "report"
+description: |
+  Create a basic QC report in HTML format based on a number of esets.
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ maintainer ]
+  - __merge__: /src/base/authors/marijke_van_moerbeke.yaml
+    roles: [ author, maintainer ]
+argument_groups:
+  - name: "Arguments"
+    arguments:
+    - type: file
+      name: "--eset"
+      description: The eset files to create the report from.
+      required: true
+      multiple: true
+    - type: file
+      name: "--output_report"
+      description: Location of the HTML report that will be created.
+      required: true
+      direction: output
+      example: report.html
+resources:
+  - type: r_script
+    path: script.R
+  # NOTE(review): template.Rmd and plateLayouts.R are declared as r_script as
+  # well; presumably only script.R is executed - confirm that 'file' is not
+  # the intended type for these two.
+  - type: r_script
+    path: template.Rmd
+  - type: r_script
+    path: plateLayouts.R
+  - path: OutputSTARsolo.png
+    type: file
+test_resources:
+  - type: r_script
+    path: test.R
+  - path: ./test_data
+engines:
+  - type: docker
+    image: rocker/r2u:24.04
+    setup:
+      - type: apt
+        packages:
+          - procps
+          - pandoc
+      - type: r
+        bioc:
+          - Biobase
+          - ComplexHeatmap
+        cran:
+          - ggplot2
+          - knitr
+          - gridExtra
+          - RColorBrewer
+          - processx
+          - whisker
+          - rmarkdown
+          - bookdown
+          - data.table
+          - platetools
+          - htmltools
+          - DT
+          - logger
+          - bit64
+        # oaStyle is installed from the Open Analytics repository.
+        script:
+          - install.packages("oaStyle", repos = c(rdepot = "https://repos.openanalytics.eu/repo/public", getOption("repos")))
+    # Packages that are only installed for running the tests.
+    test_setup:
+      - type: r
+        packages:
+          - testthat
+          - R.utils
+runners:
+  - type: executable
+  - type: nextflow
+
+     
--- a/src/report/plateLayouts.R
+++ b/src/report/plateLayouts.R
@@ -0,0 +1,430 @@
+
+#' Displays the annotation of the wells in a plateLayout
+#'
+#' Normalises the well identifiers to the zero-padded form (e.g. "A01"),
+#' completes the plate with \code{platetools::fill_plate}, derives one colour
+#' per level of \code{valueVariable} and delegates the drawing to
+#' \code{plateLayout}. The input table is copied first, so the object passed
+#' by the caller is left untouched.
+#'
+#' @param plateData a data.table object containing the information
+#' of the plate. This must contain a "WellID" and a "SampleName" column.
+#' @param plateName The plate name
+#' @param valueVariable The name of the variable in 'plateData' to
+#' be visualized in a plate layout. If NULL, all wells are drawn in grey.
+#' @param textVariable The name(s) of the variable(s) in 'plateData' to be
+#' shown in the wells of the plate layout. If NULL, no text is shown.
+#' @param colours A named character vector containing the colours
+#' for the different levels of the valueVariable. The names should
+#' correspond to the levels. If not specified, numeric-looking levels get a
+#' scheme of blues, other levels a scheme of greens and "failed" gets red.
+#' @param breaks Numeric vector indicating breaks for plot coloring.
+#' Defaults to one break per colour.
+#' @param colourWellText Colour to display the text in the wells.
+#' @param layout Integer vector of length two with number of rows and
+#' columns in a plate, e.g. \code{c(16,24)}
+#' @param legend.title A title for the legend
+#' @param plot.title A title for the plot, will be concatenated
+#' with the plate name
+#' @param textFontSize Font size of the text shown in the wells
+#' @param ... additional arguments for the \code{plateLayout} function
+#' @return The value returned by \code{plateLayout}.
+#' @import data.table
+#' @importFrom platetools fill_plate
+#' @export
+plateLayout.annotation <- function(
+  plateData,
+  plateName = character(),
+  valueVariable = "Dose",
+  textVariable = NULL,
+  breaks = NULL, colours = NULL,
+  colourWellText = "black",
+  layout = c(16, 24),
+  legend.title = "Dose",
+  plot.title = "Plate Annotation - ",
+  textFontSize = 9, ...
+) {
+  # Bound to NULL only to keep static checks quiet about the column names
+  # used inside the data.table expressions below.
+  WellID <- Label <- NULL
+
+  if (!(all(c("WellID", "SampleName") %in% colnames(plateData)))) {
+    stop(" 'WellID' and 'SampleName' column required in plateData object")
+  }
+
+  # One or two upper-case letters followed by one or two digits.
+  checkWellID <- grepl("^[[:upper:]]{1,2}[[:digit:]]{1,2}$", plateData$WellID)
+  if (!all(checkWellID)) {
+    stop("WellID does not have the correct format")
+  }
+
+  # `:=` updates a data.table by reference; work on a copy so the caller's
+  # table does not get its WellID column rewritten as a side effect.
+  plateData <- data.table::copy(plateData)
+
+  # Rebuild the identifier as <last letter><two-digit column number>.
+  plateData[, WellID := paste0(
+    sub(".*([[:alpha:]]).+", "\\1", plateData$WellID),
+    sprintf(
+      "%02d", as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID))
+    )
+  )]
+
+  plateData <- platetools::fill_plate(
+    plateData, "WellID", plate = layout[1] * layout[2]
+  )
+
+  plateData$column <- factor(
+    sprintf(
+      "%02d",
+      as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID))
+    ),
+    levels = sprintf("%02d", seq(1, layout[2]))
+  )
+  plateData$row <- factor(
+    sub(".*([[:alpha:]]).+", "\\1", plateData$WellID),
+    levels = LETTERS[seq(1, layout[1])]
+  )
+
+  # The plotted variable is always stored as character in a 'values' column.
+  if (!is.null(valueVariable)) {
+    plateData[, values := as.character(plateData[, ..valueVariable][[1]])]
+  } else {
+    plateData[, values := "grey"]
+    colours <- setNames("grey", "grey")
+  }
+  valueVar <- "values"
+
+  if (is.null(colours)) {
+    blues <- colorRampPalette(c("#d6e0ff", "#2171B5"))
+    greens <- colorRampPalette(c("light green", "dark green"))
+
+    # Split the levels into numeric-looking ones and all the others.
+    levelValues <- unique(plateData[, values])
+    isNumericLevel <- grepl(
+      "^[[:digit:]]+([.][[:digit:]]+)?$",
+      trimws(levelValues)
+    )
+    numLevels <- sort(as.numeric(as.character(levelValues[isNumericLevel])))
+    otherLevels <- sort(as.character(levelValues[!isNumericLevel]))
+
+    colours <- c(blues(length(numLevels)), greens(length(otherLevels)), "red")
+    names(colours) <- c(numLevels, otherLevels, "failed")
+  }
+
+  if (!is.null(textVariable)) {
+    # Stack the text variables and break the label after every '-' and '_'.
+    plateData[,
+      Label :=  do.call(paste, c(.SD, sep = "\n ")),
+      .SDcols = textVariable
+    ]
+    plateData[, Label :=  gsub("-", "-\n", Label)]
+    plateData[, Label :=  gsub("_", "_\n", Label)]
+    textVar <- "Label"
+  } else {
+    textVar <- NULL
+  }
+
+  if (is.null(breaks)) {
+    breaks <- seq_along(colours)
+  }
+
+  plateLayout(
+    plateData = plateData, valueVariable = valueVar,
+    textVariable = textVar, plateName = plateName,
+    breaks = breaks, colourWellText = colourWellText,
+    legend.title = legend.title, layout = layout,
+    colours = colours, plot.title = plot.title,
+    textFontSize = textFontSize, ...
+  )
+}
+
+
+
+#' Create a heatmap of values in a plateLayout view. The values can be
+#' library sizes, number of genes, qcScore (0/1) or a factor.
+#'
+#' The input table is copied before the well identifiers are normalised, so
+#' the object passed by the caller is left untouched.
+#'
+#' @param plateData A data.table of the values to be visualized with
+#' at least the column of interest (specified in 'valueVariable'),
+#' a 'SampleName' column and a 'WellID' column indicating the wells in the
+#' plate. The WellID is a combination of a letter (row in the plate) and an
+#' integer (column in the plate). When 'makeContourColours' is TRUE a
+#' 'WellType' column is used to select the wells that get a contour.
+#' @param valueVariable The name of the variable in 'plateData'
+#' to be visualized in a plate layout
+#' @param textVariable The name of the variable in 'plateData'
+#' to be shown in the wells of the plate layout. Defaults to the
+#' valueVariable and if NULL, no text will be displayed.
+#' @param breaks Numeric vector indicating breaks for plot coloring.
+#' If NULL they are derived from the plotted values.
+#' @param colours Colours to be used for levels specified by
+#' the breaks. If NULL, a colour scheme of purples is shown.
+#' @param colourWellText Colour to display the text in the wells.
+#' @param textFontSize Font size of the text shown in the wells
+#' @param layout Integer vector of length two with number of rows
+#' and columns in a plate, e.g. \code{c(16,24)}
+#' @param makeContourColours Logical, whether or not the plate
+#' layout will contain a contour colours for the wells based on the
+#' parameters in 'contourColours' and 'categories'
+#' @param contourVariable The variable used for the contour colouring
+#' @param contourColours Character vector specifying a colour for
+#' each range in 'categories'
+#' @param lwdContours Numeric vector specifying the contour line width for
+#' each range in 'categories'
+#' @param labelsCategories Character vector specifying the names
+#' (labels) for each range in 'categories'
+#' @param categories if contour Variable is not a factor, a numeric
+#' vector specifying the categories to divide the 'contourVariable',
+#' including the lower and upper limits.
+#' @param plateName The plate name
+#' @param plot.title A title for the plot, will be concatenated with
+#' the plate name
+#' @param legend.title A title for the legend
+#' @param legendFontSize Font size of the legend labels
+#' @param row_split,col_split Passed to the \code{row_split} and
+#' \code{column_split} arguments of \code{ComplexHeatmap::Heatmap}. Default to
+#' a single slice covering all rows / columns of 'layout'.
+#' @param legendFontSizeTitle Font size of the legend title
+#' @param displayHeatmap Logical, whether to display the plateLayout heatmap
+#' @param saveHeatmap Logical, whether to save the plateLayout heatmap
+#' @param outputDir The directory where the plateLayout heatmap should be saved
+#' @param prefix The prefix to the file name of the saved plateLayout heatmap
+#' @param ... additional arguments for \code{ComplexHeatmap::Heatmap} function
+#' @return The \code{Heatmap} object.
+#' @importFrom platetools fill_plate
+#' @importFrom RColorBrewer brewer.pal
+#' @importFrom ComplexHeatmap Heatmap
+#' @importFrom circlize colorRamp2
+#' @importFrom grid grid.text grid.rect gpar unit
+#' @importFrom grDevices dev.off png
+#' @importFrom stats quantile
+#' @export
+plateLayout <- function(
+  plateData, valueVariable, textVariable = valueVariable,
+  breaks = NULL, colours = NULL, colourWellText = "white", textFontSize = 6,
+  layout = c(16, 24), makeContourColours = FALSE, contourVariable = character(),
+  contourColours = c("red", "orange", "seagreen3"), lwdContours = c(1, 1, 1),
+  labelsCategories = c('1', '2', '3'), categories = NULL, plateName = character(),
+  plot.title = character(), legend.title = NULL, legendFontSize = 15,
+  row_split = rep("A", layout[1]), col_split = rep("A", layout[2]),
+  legendFontSizeTitle = 15,
+  displayHeatmap = TRUE, saveHeatmap = FALSE, outputDir = ".", prefix = "",
+  ...
+) {
+  # Bound to NULL only to keep static checks quiet about the column name
+  # used inside the data.table expression below.
+  WellID <- NULL
+  if (!(all(c("WellID", "SampleName") %in% colnames(plateData)))) {
+    stop(" 'WellID' and 'SampleName' column required in plateData object")
+  }
+
+  # `:=` updates a data.table by reference; work on a copy so the caller's
+  # table does not get its WellID column rewritten as a side effect.
+  plateData <- data.table::copy(plateData)
+
+  # Rebuild the identifier as <last letter><two-digit column number>.
+  plateData[, WellID := paste0(
+    sub(".*([[:alpha:]]).+", "\\1", plateData$WellID),
+    sprintf(
+      "%02d",
+      as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID))
+    )
+  )]
+
+  # Complete the plate according to the requested layout (previously this was
+  # hard-coded to 384 wells regardless of 'layout').
+  plateData <- platetools::fill_plate(
+    plateData, "WellID", plate = layout[1] * layout[2]
+  )
+
+  plateData$column <- factor(
+    sprintf("%02d", as.numeric(
+      sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)
+    )),
+    levels = sprintf("%02d", seq(1, layout[2]))
+  )
+  plateData$row <- factor(
+    sub(".*([[:alpha:]]).+", "\\1", plateData$WellID),
+    levels = LETTERS[seq(1, layout[1])]
+  )
+
+  plateValues <- plateLayoutFormat(
+    plateData,
+    varOfInterest = valueVariable,
+    rows = layout[1],
+    cols = layout[2]
+  )
+  if (!is.null(textVariable)) {
+    plateText <- plateLayoutFormat(
+      plateData, varOfInterest = textVariable,
+      rows = layout[1],
+      cols = layout[2]
+    )
+  }
+
+  # Turn a camelCase title into words and capitalise a leading lower-case
+  # letter, e.g. "libSize - " becomes "Lib Size - ".
+  plot.title <- gsub(
+    "^([a-z])", "\\U\\1",
+    gsub("([A-Z])", " \\1", plot.title, perl = TRUE),
+    perl = TRUE
+  )
+  mainTitle <- paste0(plot.title, plateName)
+
+  # NA marks "no contour for this well"; the cell function below tests for it.
+  plateContourColours <- matrix(
+    NA_character_, nrow = layout[1], ncol = layout[2]
+  )
+
+  if (makeContourColours) {
+    contourData <- plateData[WellType %in% c("nonEmpty", "Treated Wells"), ]
+    contourValues <- contourData[, ..contourVariable][[1]]
+
+    if (is.numeric(contourValues)) {
+      # Right-closed intervals, i.e. (lower, upper].
+      contourData$contours <- cut(
+        contourValues,
+        categories,
+        right = TRUE,
+        labels = labelsCategories
+      )
+    } else {
+      contourData$contours <- contourValues
+    }
+    names(contourColours) <- labelsCategories
+    names(lwdContours) <- labelsCategories
+
+    # Only wells that occur exactly once in the contour data get a category.
+    contourWellIds <- as.character(contourData$WellID)
+    for (i in seq_len(layout[1])) {
+      for (j in seq_len(layout[2])) {
+        sampleHit <- which(
+          contourWellIds == paste0(LETTERS[i], sprintf("%02d", j))
+        )
+        if (length(sampleHit) == 1) {
+          plateContourColours[i, j] <- as.character(
+            contourData$contours[sampleHit]
+          )
+        }
+      }
+    }
+  }
+
+  plateValues$contours <- plateContourColours
+  colnames(plateValues$values) <- seq_len(ncol(plateValues$values))
+
+  if (is.null(breaks)) {
+    observedValues <- plateValues$values[!is.na(plateValues$values)]
+    if (all(observedValues >= 0)) {
+      # Pass all observed values (not just their maximum) so that
+      # computeBreaks can tell a constant plate from a varying one.
+      breaks <- computeBreaks(7, observedValues)
+    } else {
+      # Empty wells are NA after fill_plate, hence na.rm.
+      breaks <- stats::quantile(
+        plateValues$values, probs = seq(0, 1, 0.125), na.rm = TRUE
+      )
+    }
+  }
+
+  if (is.null(colours)) {
+    # Namespace-qualified: when this file is source()d the roxygen imports do
+    # not apply, and an unqualified colorRamp2 would silently end up in the
+    # fallback below whenever circlize is not attached.
+    colours <- tryCatch(
+      circlize::colorRamp2(
+        breaks = breaks,
+        colors = RColorBrewer::brewer.pal(length(breaks), "Purples")
+      ),
+      error = function(cond) {
+        # Best-effort fallback when no colour ramp can be built for 'breaks'.
+        c("#9370DB", "white")
+      }
+    )
+  }
+
+  legendTitle <- if (is.null(legend.title)) {
+    paste0(valueVariable, "\n")
+  } else {
+    paste0(legend.title, "\n")
+  }
+
+  ht <- ComplexHeatmap::Heatmap(
+    plateValues$values,
+    column_title = mainTitle, column_title_side = "top",
+    rect_gp = grid::gpar(lwd = 0.4),
+    cluster_rows = FALSE, cluster_columns = FALSE,
+    col = colours, row_title = NULL,
+    row_split = row_split, column_split = col_split,
+    row_names_side = "left",
+    cluster_row_slices = FALSE,
+    cluster_column_slices = FALSE,
+    show_heatmap_legend = TRUE,
+    heatmap_legend_param = list(
+      title = legendTitle,
+      grid_height = grid::unit(9, "mm"), border = "black",
+      labels_gp = grid::gpar(fontsize = legendFontSize),
+      title_gp = grid::gpar(fontsize = legendFontSizeTitle)
+    ),
+    cell_fun = function(j, i, x, y, width, height, fill) {
+      if (is.na(plateValues$values[i, j])) {
+        # Wells without a value are blanked out.
+        grid::grid.rect(
+          x, y, width, height,
+          gp = grid::gpar(
+            fill = "white", alpha = 0.7, lwd = 0.7, col = "white"
+          )
+        )
+      } else if (!is.null(textVariable)) {
+        grid::grid.text(
+          plateText$values[i, j], x, y,
+          just = "centre",
+          gp = grid::gpar(fontsize = textFontSize, col = colourWellText)
+        )
+      }
+      if (makeContourColours) {
+        if (!is.na(plateValues$contours[i, j])) {
+          contourLabel <- as.character(plateValues$contours[i, j])
+          grid::grid.rect(
+            x, y, width, height,
+            gp = grid::gpar(
+              col = contourColours[contourLabel],
+              fill = NA,
+              lwd = lwdContours[contourLabel]
+            )
+          )
+        }
+      }
+    },
+    ...
+  )
+
+  if (displayHeatmap) {
+    print(ht)
+  }
+  if (saveHeatmap) {
+    grDevices::png(
+      file.path(
+        outputDir,
+        paste0(prefix, gsub(" |-", "", plot.title), "_", plateName, ".png")
+      ),
+      width = 30, height = 10, units = "cm", res = 1200
+    )
+    # Close the device even when drawing fails, so it does not stay open.
+    tryCatch(print(ht), finally = grDevices::dev.off())
+  }
+
+  return(ht)
+}
+
+
+#' Arrange one variable of the plate data as a rows x cols matrix that
+#' mirrors the physical plate layout.
+#'
+#' A cell is filled only when its well identifier (row letter followed by
+#' the zero-padded column number, e.g. "A01") occurs exactly once in
+#' \code{data$WellID}; every other cell stays \code{NA}. Any error raised
+#' while looking up a well is caught, and only reported when
+#' \code{verbose} is \code{TRUE}.
+#'
+#' @param data Table of the values to be visualized with at least
+#' the column of interest (specified in 'varOfInterest') and a 'WellID' column
+#' indicating the wells in the plate. The lookup uses the \code{..name}
+#' column prefix, so this is presumably expected to be a data.table.
+#' @param varOfInterest The name of the variable in 'data' to be visualized
+#' in a plate layout
+#' @param rows number of rows in a plate layout
+#' @param cols number of columns in a plate layout
+#' @param verbose if \code{TRUE}, a message is printed for every well
+#' whose lookup raised an error
+#' @return A list with a single element \code{values}, the matrix with row
+#' names set to the row letters.
+#' @export
+plateLayoutFormat <- function(
+  data, varOfInterest,
+  rows = 16, cols = 24,
+  verbose = FALSE
+) {
+  plateValues <- matrix(NA, nrow = rows, ncol = cols)
+
+  for (rowIdx in seq_len(rows)) {
+    rowLetter <- LETTERS[rowIdx]
+    for (colIdx in seq_len(cols)) {
+      wellLabel <- paste0(rowLetter, sprintf("%02d", colIdx))
+      tryCatch(
+        {
+          matches <- which(as.character(data$WellID) == wellLabel)
+          # Ambiguous (duplicated) or absent wells are left as NA.
+          if (length(matches) == 1) {
+            plateValues[rowIdx, colIdx] <- data[matches, ..varOfInterest][[1]]
+          }
+        },
+        error = function(e) {
+          if (verbose == TRUE) {
+            print(paste0(wellLabel, " is missing."))
+          }
+        }
+      )
+    }
+  }
+
+  row.names(plateValues) <- LETTERS[1:rows]
+  list("values" = plateValues)
+}
+
+
+
+#' Helper function to automate break selection for raw count data
+#'
+#' This function creates an exponentially increasing vector for given number
+#' breaks between zero and the maximum of the data. It is particularly useful
+#' for raw counts or raw counts per million.
+#'
+#' The result always starts with 0. When the data varies and its maximum is
+#' positive, the remaining breaks grow geometrically from 1 up to the
+#' maximum. When the data is constant, empty, entirely missing or has a
+#' non-positive maximum, the fixed scheme \code{c(0, 0.5, upper)} is returned,
+#' where \code{upper} is the maximum or 1, whichever is larger.
+#'
+#' @param nBreaks Number of breaks to be generated (including the leading 0)
+#' when the exponential scheme is used; must be at least 3 in that case.
+#' @param variable Values the breaks are computed for. A numeric vector or
+#' matrix, or a data.frame / data.table whose columns are all numeric.
+#' Missing values are ignored.
+#' @return A numeric vector of strictly distinct breaks starting at 0.
+#' @export
+computeBreaks <- function(nBreaks, variable) {
+
+  # Flatten first: for a data.frame / data.table, length(unique(.)) would
+  # count columns instead of distinct values.
+  values <- unlist(variable, use.names = FALSE)
+  values <- values[!is.na(values)]
+
+  maxElement <- if (length(values) > 0) max(values) else NA_real_
+
+  if (length(unique(values)) <= 1 || maxElement <= 0) {
+    # No spread to scale on (or log() undefined): use a fixed scheme. The
+    # leading 0 is added once, below, so the breaks stay free of duplicates.
+    upper <- if (is.na(maxElement) || maxElement < 1) 1 else maxElement
+    breaks <- c(0.5, upper)
+  } else {
+    if (nBreaks < 3) {
+      stop("'nBreaks' must be at least 3 to compute exponential breaks")
+    }
+    # Geometric progression from 1 (exponent 0) to maxElement (exponent 1)
+    # in nBreaks - 1 steps.
+    exponents <- seq(0, nBreaks - 2) / (nBreaks - 2)
+    breaks <- maxElement^exponents
+  }
+  return(c(0, breaks))
+}
--- a/src/report/script.R
+++ b/src/report/script.R
@@ -0,0 +1,33 @@
+# Entry point of the report component: renders template.Rmd to an HTML
+# report for the esets given in `par$eset`.
+# `par` and `meta` are not defined in this file; they are presumably
+# injected by the component wrapper - confirm.
+library(whisker)
+library(logger)
+
+# Point the TMP environment variable at the temporary directory provided
+# through `meta`.
+# NOTE(review): R normally fixes its session temp dir at startup, so this
+# presumably only affects child processes and not tempdir() - confirm.
+log_info("Setting temporary directory to: {meta$temp_dir}")
+Sys.setenv(TMP = meta$temp_dir)
+temp_folder <- tempdir(check = TRUE)
+log_info("Created temporary directory {temp_folder}")
+
+template <- file.path(meta$resources_dir, "template.Rmd")
+
+# Make the directory part of every eset path absolute while keeping the
+# file name as given; the result is a list with one path per eset.
+esets_normalized <- lapply(par$eset, function(eset_path) {
+  return(file.path(normalizePath(dirname(eset_path)), basename(eset_path)))
+})
+
+# The placeholders in braces are filled in by the logger from the
+# variables of this script.
+log_info(paste0(
+  "Rendering markdown {template} to HTML ",
+  "{par$output_report} with esets {paste(esets_normalized, collapse = ', ')}"
+))
+
+# NOTE(review): `par$report_dir` is used for both `intermediates_dir` and
+# the `outputDir` report parameter, but the config.vsh.yaml in this commit
+# declares only `--eset` and `--output_report`, so it is presumably NULL
+# here - confirm whether a `--report_dir` argument is missing.
+rmarkdown::render(
+  normalizePath(template),
+  output_file = basename(par$output_report),
+  output_dir = dirname(par$output_report),
+  runtime = "static",
+  intermediates_dir = par$report_dir,
+  clean = TRUE,
+  params = list(
+    esets = esets_normalized,
+    outputDir = par$report_dir
+  )
+)
+
+log_info("Done")
--- a/src/report/template.Rmd
+++ b/src/report/template.Rmd
@@ -0,0 +1,977 @@
+---
+title: "Exploratory Data Report"
+date: "`r format(Sys.time(), '%d %B, %Y')`"
+editor_options:
+  chunk_output_type: console
+output: 
+  oaStyle::html_report
+# parameters which are overwritten by the script
+params:
+  outputDir: 'output/'
+  esets:
+    - sample1.rds
+    - sample2.rds
+---
+
+<!---
+Copy this template in your working directory (where you want to run the report).
+This template can be used as a starting document to run a preliminary DRUGseq report
+-->
+
+<!---
+Use full page width
+-->
+
+<style type="text/css">
+div.main-container {
+  max-width: 1600px !important;
+  margin-left: auto;
+  margin-right: auto;
+}
+</style>
+
+
+
+```{r params, eval = TRUE, include = FALSE}
+# Copy the report parameters into plain variables used by the later chunks.
+esets <- params[["esets"]]
+outputDir <- params[["outputDir"]]
+```
+
+
+```{r outputDir, echo = FALSE}
+## Required: ABSOLUTE outputDir
+outputDir <- file.path(outputDir)
+
+# On Windows the drive prefix is dropped, i.e. the path becomes
+# "/Users/..." instead of "C:/Users/...", and a trailing "/" is appended.
+if (identical(.Platform$OS.type, "windows")) {
+  pathParts <- unlist(strsplit(outputDir, split = "/"))
+  pathWithoutDrive <- paste(pathParts[-1], collapse = "/")
+  outputDir <- paste0("/", pathWithoutDrive, "/")
+}
+```
+
+
+
+
+```{r optionsChunkDoNotModify, echo = FALSE, message = FALSE, warning=FALSE}
+
+## Chunk with options for knitr. This chunk should not be modified.
+knitr::opts_chunk$set(
+  # evaluation and caching
+  eval = TRUE,
+  cache = FALSE,
+  error = FALSE,
+  # what ends up in the report
+  echo = FALSE,
+  message = FALSE,
+  warning = FALSE,
+  results = "asis",
+  comment = NA, #"#",
+  tidy = FALSE,
+  collapse = TRUE,
+  # figure defaults
+  fig.width = 20,
+  fig.height = 10,
+  out.width = "100%"
+)
+
+# Evaluate all chunks relative to the current working directory.
+knitr::opts_knit$set(root.dir = getwd())
+
+# Print warnings as they occur and allow wide console output.
+options(warn = 1, width = 200)
+
+```
+
+```{r libraries_and_functions}
+source("plateLayouts.R")
+library(ComplexHeatmap)
+library(data.table)
+library(ggplot2)
+library(knitr)
+library(Biobase)
+library(gridExtra)
+library(RColorBrewer)
+```
+
+
+```{r dataImport}
+
+# Create esetList: read every eset file and name the list by pool name.
+esetList <- sapply(
+  esets, simplify = FALSE,
+  USE.NAMES = TRUE,
+  function(eset_raw) {
+    if (!file.exists(eset_raw)) {
+      stop(paste0("Provided path '", eset_raw, "' is not a file."))
+    }
+    readRDS(eset_raw)
+  }
+)
+pools <- sapply(esetList, function(eset) {
+  unique(eset$PoolName)
+})
+names(esetList) <- unlist(pools)
+
+# Create qcData: the sample annotation of all esets stacked in one table.
+pDataList <- lapply(esetList, function(eset) data.table(pData(eset)))
+qcData <- rbindlist(pDataList, fill = TRUE)
+
+textVars <- "SampleName"
+annotationVar <- "PoolName"
+
+if (!"SampleName" %in% names(qcData)) {
+  qcData[, SampleName := paste0(PoolName, "_", WellBC)]
+}
+qcData[, log10LibSize := round(log10(NumberOfInputReads))]
+qcData[, (annotationVar) := lapply(.SD, as.factor), .SDcols = annotationVar]
+
+
+# Pick one colour per level of the annotation variable.
+colourList <- list()
+Design_levels <- sort(
+  as.character(unique(qcData[, ..annotationVar][[1]])),
+  decreasing = TRUE
+)
+nLevels <- length(Design_levels)
+annotationText <- textVars
+
+if (nLevels == 1) {
+  colours <- c("#d6e0ff", "lightgrey")
+  names(colours) <- c(Design_levels, "Empty")
+} else if (nLevels == 2) {
+  colours <- c("#d6e0ff", "#FF9999")
+  names(colours) <- c(Design_levels)
+} else if (nLevels <= 20) {
+  if (nLevels > 12) {
+    # brewer.pal() never returns fewer than 3 colours (and warns when asked
+    # to), so request at least 3 and keep exactly one colour per level.
+    colours <- c(
+      brewer.pal(12, "Set3"),
+      brewer.pal(max(3, nLevels - 12), "Pastel2")
+    )[seq_len(nLevels)]
+  } else {
+    colours <- c(brewer.pal(nLevels, "Set3"))
+  }
+  names(colours) <- c(Design_levels)
+} else {
+  # More than 20 levels: a single colour, and the annotation variable itself
+  # is used as text. (This branch used to reference an undefined `annotVar`.)
+  colours <- c("#d6e0ff")
+  names(colours) <- c("nonEmpty")
+  annotationText <- annotationVar
+}
+
+colourList[[annotationVar]] <- list(
+  "colours" = colours,
+  "annotVar" = annotationVar,
+  "text" = annotationText
+)
+```
+
+# Pool Description
+
+Per pool within this study, there are several pool layout plots shown, based on the
+
+* number of STAR input reads (= library size)
+
+* log10 transformed number of STAR input reads
+
+* number of detected UMIs
+
+* number of detected genes
+
+* number of chromosomal reads 
+
+* percentage of ERCC 
+
+* percentage of mitochondria 
+
+
+> The values for the different samples within each pool are expected to be comparable if the content of the different pools is equally diverse.
+
+```{r plateAnnotation, out.width = "100%",fig.width = 20, fig.height= 10}
+
+# Variables shown as plate layouts, one tab per variable within each pool.
+plateVars <- c("NumberOfInputReads", "log10LibSize", "NumberOfMappedReads",
+               "NumberOfChromReads", "NumberOfUMIs", "NumberOfGenes",
+               "pctMT", "pctERCC")
+
+# Breaks are computed over all pools so that the plates share one scale per
+# variable. The column is extracted as a plain vector: a one-column
+# data.table would make computeBreaks() count columns instead of values.
+breaksVars <- lapply(
+  plateVars,
+  function(var) {
+    computeBreaks(7, qcData[[var]])
+  }
+)
+names(breaksVars) <- plateVars
+
+for (pool in pools) {
+  cat("\n\n")
+  cat(paste0("## ", pool, " {.tabset} \n\n"))
+  poolData <- qcData[PoolName == pool]
+  for (plateVar in plateVars) {
+    cat("\n\n")
+    cat(sprintf("### %s {.unnumbered}", plateVar))
+    cat("\n\n")
+    # Title and legend name the variable that is actually plotted (they used
+    # to read "libSize" for every variable).
+    plateLayout(
+      poolData, valueVariable = plateVar,
+      textFontSize = 10, legendFontSize = 12,
+      plateName = pool, plot.title = paste0(plateVar, " - "),
+      legend.title = plateVar, breaks = breaksVars[[plateVar]]
+    )
+    cat("\n\n")
+  }
+  cat("\n\n")
+}
+```
+
+<br>
+
+
+# Data Distributions
+
+
+## Reads Distributions {.tabset}
+
+The 4 box plots below represent the distributions per pool of the different samples based on:
+
+* the number of STAR input reads
+
+* the number of STAR mapped reads
+
+* the percentage of STAR mapped reads
+
+* the number of detected genes
+
+> The distributions contribute to the QC metrics mentioned in Par 3. The higher these values, the better.
+> The data range for the different plates is expected to be comparable if the content of the different plates is equally diverse.
+
+
+### Number of Input Reads {.tabset .unnumbered}
+
+```{r settings_1}
+
+# Shared plot settings; `figHeight` is used as the `fig.height` chunk option
+# of the box plot chunks below.
+nColPlots <- 1
+figHeight <- 7
+
+```
+
+#### Distribution {.tabset .unnumbered}
+
+
+```{r boxplots_input_plate, fig.height = figHeight}
+# Box plot of the number of input reads, one box per pool.
+ggplot(
+  data = qcData,
+  mapping = aes(x = PoolName, y = NumberOfInputReads, colour = PoolName)
+) +
+  geom_boxplot() +
+  labs(y = "Number of Input Reads", title = "Number of Input Reads") +
+  theme(
+    # x axis text and ticks are blanked; pools are mapped to colour
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.title = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+
+```
+
+
+### Number of Mapped Reads {.tabset .unnumbered}
+
+#### Distribution {.unnumbered}
+
+```{r boxplots_mapped_plate, fig.height = figHeight}
+
+# Box plot of the number of mapped reads, one box per pool.
+ggplot(
+  data = qcData,
+  mapping = aes(x = PoolName, y = NumberOfMappedReads, colour = PoolName)
+) +
+  geom_boxplot() +
+  labs(y = "Number of Mapped Reads", title = "Number of Mapped Reads") +
+  theme(
+    # x axis text and ticks are blanked; pools are mapped to colour
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.title.y = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+```
+
+
+#### pct Mapped Reads {.unnumbered}
+
+```{r boxplots_pctMapped_plate, fig.height = figHeight}
+# Box plot of the percentage of mapped reads, one box per pool.
+ggplot(
+  data = qcData,
+  mapping = aes(x = PoolName, y = PctMappedReads, colour = PoolName)
+) +
+  geom_boxplot() +
+  labs(y = "pct Mapped Reads", title = "pct Mapped Reads") +
+  theme(
+    # x axis text and ticks are blanked; pools are mapped to colour
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.title.y = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+```
+
+### Number of Chromosomal Reads {.tabset .unnumbered}
+
+#### Distribution {.unnumbered}
+
+```{r boxplots_chrom_plate, fig.height = figHeight}
+
+# Box plot of the number of chromosomal reads, one box per pool.
+ggplot(
+  data = qcData,
+  mapping = aes(x = PoolName, y = NumberOfChromReads, colour = PoolName)
+) +
+  geom_boxplot() +
+  labs(
+    y = "Number of Chromosomal Reads",
+    title = "Number of Chromosomal Reads"
+  ) +
+  theme(
+    # x axis text and ticks are blanked; pools are mapped to colour
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.title.y = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+
+```
+
+#### pct Chromosomal Reads {.unnumbered}
+
+```{r boxplots_pctChrom_plate, fig.height = figHeight}
+
+# Box plot of the percentage of chromosomal reads, one box per pool.
+ggplot(
+  data = qcData,
+  mapping = aes(x = PoolName, y = pctChrom, colour = PoolName)
+) +
+  geom_boxplot() +
+  labs(y = "pct Chromosomal Reads", title = "pct Chromosomal Reads") +
+  theme(
+    # x axis text and ticks are blanked; pools are mapped to colour
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.title.y = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+```
+
+### Number of UMIs {.tabset .unnumbered}
+
+#### Distribution {.tabset .unnumbered}
+
+
+```{r boxplots_umi_plate, fig.height = figHeight}
+
+# Box plot of the number of UMIs, one box per pool.
+ggplot(
+  data = qcData,
+  mapping = aes(x = PoolName, y = NumberOfUMIs, colour = PoolName)
+) +
+  geom_boxplot() +
+  labs(y = "Number of UMIs", title = "Number of UMIs") +
+  theme(
+    # x axis text and ticks are blanked; pools are mapped to colour
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.title = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+
+```
+
+#### Density distribution {.unnumbered}
+
+```{r density_numberOfUMIs}
+
+## Pre-filtering data exploration
+# Long format: one row per sample and count variable.
+dt_plot <- melt(
+  qcData,
+  id.vars = c("SampleName", "PoolName", "WellID"),
+  measure.vars = c("NumberOfInputReads", "NumberOfMappedReads", "NumberOfUMIs")
+)
+
+# Sample counts reported in the subtitle (missing values are not counted).
+nMappedAbove <- sum(qcData$NumberOfMappedReads > 1.5e6, na.rm = TRUE)
+nUmisAbove <- sum(qcData$NumberOfUMIs > 5e5, na.rm = TRUE)
+
+# Density of the three count variables per pool, with dashed reference
+# lines at 500k and 1.5M.
+readsDensity_plot <- ggplot(dt_plot, aes(value)) +
+  geom_density(aes(fill = variable), alpha = 0.8) +
+  facet_grid(~ PoolName, scales = "free_x", space = "fixed", drop = TRUE) +
+  geom_vline(
+    xintercept = 5e5, linetype = "dashed",
+    color = "steelblue3", size = 2
+  ) +
+  annotate(
+    "text", x = 3.5e5, y = 2e-6, label = "500k",
+    angle = 90, color = "steelblue3", size = 10
+  ) +
+  geom_vline(
+    xintercept = 1.5e6, linetype = "dashed",
+    color = "forestgreen", size = 2
+  ) +
+  annotate(
+    "text", x = 1.35e6, y = 2e-6, label = "1.5M",
+    angle = 90, color = "forestgreen", size = 10
+  ) +
+  labs(
+    title = "Density plot",
+    subtitle = paste0(
+      "# Samples with NumberOfMappedReads > 1.5M: ",
+      nMappedAbove,
+      "\n# Samples with NumberOfUMIs > 500k: ",
+      nUmisAbove
+    ),
+    caption = paste0("# Total samples (after removing empty): ", nrow(qcData)),
+    x = "Count",
+    fill = "Variable"
+  ) +
+  theme(
+    # the y axis (density) is hidden completely
+    axis.text.y = element_blank(),
+    axis.ticks.y = element_blank(),
+    axis.title.y = element_blank(),
+    axis.text.x = element_text(angle = 90, size = 14),
+    axis.title = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    plot.subtitle = element_text(size = 17),
+    plot.caption = element_text(size = 15),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 5)
+  )
+readsDensity_plot
+
+```
+
+### Number of Genes {.tabset .unnumbered}
+
+#### Distribution {.unnumbered}
+
+```{r boxplots_genes_plate, fig.height = figHeight}
+# Box plot of the number of genes, one box per pool.
+ggplot(
+  data = qcData,
+  mapping = aes(x = PoolName, y = NumberOfGenes, colour = PoolName)
+) +
+  geom_boxplot() +
+  labs(y = "Number of Genes", title = "Number of Genes") +
+  theme(
+    # x axis text and ticks are blanked; pools are mapped to colour
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.title.y = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+```
+
+## {.tabset .toc-ignore .unnumbered}
+
+
+In addition, several plots are shown visualizing the efficiency of the reads-to-genes translation:
+
+* the number of input reads vs the number of mapped reads
+
+* the number of chromosomal reads vs the number of mapped reads
+
+* the number of mapped reads per UMI vs the number of mapped reads 
+
+* the number of UMIs vs the number of mapped reads
+
+* the number of mapped reads vs the number of genes
+
+* the number of chromosomal reads vs the number of genes
+
+* the number of mapped reads per UMI vs the number of genes 
+
+### Mapping Efficiency {.tabset .unnumbered}
+
+#### Number of Input Reads {.unnumbered}
+
+```{r mapping_efficiency_1_plate, fig.height = 7}
+
+# Scatter plot: mapped reads (y) against input reads (x), coloured by pool.
+ggplot(
+  data = qcData,
+  mapping = aes(
+    x = NumberOfInputReads, y = NumberOfMappedReads, colour = PoolName
+  )
+) +
+  geom_point() +
+  labs(
+    x = "Number of Input Reads",
+    y = "Number of Mapped Reads",
+    title = "Number of Mapped Reads vs Number of Input Reads"
+  ) +
+  theme(
+    axis.text = element_text(angle = 90, size = 15),
+    axis.title = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+
+```
+
+
+#### Number of Chromosomal Reads {.unnumbered}
+
+```{r mapping_efficiency_2_plate, fig.height = 7}
+
+# Scatter plot: mapped reads (y) against chromosomal reads (x), coloured
+# by pool.
+ggplot(
+  data = qcData,
+  mapping = aes(
+    x = NumberOfChromReads, y = NumberOfMappedReads, colour = PoolName
+  )
+) +
+  geom_point() +
+  labs(
+    x = "Number of Chromosomal Reads",
+    y = "Number of Mapped Reads",
+    title = "Number of Chromosomal Reads vs Number of Mapped Reads"
+  ) +
+  theme(
+    axis.text = element_text(angle = 90, size = 15),
+    axis.title = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+
+```
+
+
+#### Number of UMI {.unnumbered}
+
+```{r mapping_efficiency_4_plate, fig.height = 7}
+
+# Scatter plot: mapped reads (y) against UMIs (x), coloured by pool.
+ggplot(
+  data = qcData,
+  mapping = aes(x = NumberOfUMIs, y = NumberOfMappedReads, colour = PoolName)
+) +
+  geom_point() +
+  labs(
+    x = "Number of UMIs ",
+    y = "Number of Mapped Reads",
+    title = "Number of UMIs vs Number of Mapped Reads"
+  ) +
+  theme(
+    axis.text = element_text(angle = 90, size = 15),
+    axis.title = element_text(size = 15),
+    plot.title = element_text(size = 18),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines"),
+    text = element_text(size = 10)
+  )
+
+```
+
+### Counting Efficiency {.tabset .unnumbered}
+
+#### Number of Mapped Reads {.unnumbered}
+
+```{r gene_efficiency_1_plate, fig.height = 7} 
+# Scatter plot of detected genes against mapped reads: one point per row of
+# qcData, coloured by pool.
+ggplot(
+  qcData,
+  aes(x = NumberOfMappedReads, y = NumberOfGenes, colour = PoolName)
+) +
+  geom_point() +
+  labs(
+    x = "Number of Mapped Reads",
+    y = "Number of Genes",
+    title = "Number of Genes vs Number of Mapped Reads"
+  ) +
+  theme(
+    text = element_text(size = 10),
+    plot.title = element_text(size = 18),
+    axis.title = element_text(size = 15),
+    axis.text = element_text(angle = 90, size = 15),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines")
+  )
+```
+
+#### Number of Chromosomal Reads {.unnumbered}
+
+```{r gene_efficiency_2_plate, fig.height = 7} 
+# Scatter plot of detected genes against chromosomal reads: one point per row
+# of qcData, coloured by pool.
+ggplot(
+  qcData,
+  aes(x = NumberOfChromReads, y = NumberOfGenes, colour = PoolName)
+) +
+  geom_point() +
+  labs(
+    x = "Number of Chromosomal Reads",
+    y = "Number of Genes",
+    title = "Number of Genes vs Number of Chromosomal Reads"
+  ) +
+  theme(
+    text = element_text(size = 10),
+    plot.title = element_text(size = 18),
+    axis.title = element_text(size = 15),
+    axis.text = element_text(angle = 90, size = 15),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines")
+  )
+```
+
+
+
+## Sequencing Saturation {.tabset}
+
+The barplots below represent the sequencing saturation per sample as determined by STAR, split per pool. 
+The HT-RNAseq platform aims for shallow sequencing resulting in relatively low sequencing saturations of 10-20%.
+In addition, the sequencing saturation vs the number of input reads is shown.
+
+### Sequencing Saturation {.unnumbered}
+
+
+
+```{r sequencingSaturation, fig.height = figHeight}
+
+# Bar plot of the sequencing saturation per WellID, bars filled by pool and
+# placed side by side. The x axis labels and ticks are hidden.
+ggplot(
+  qcData,
+  aes(x = WellID, y = SequencingSaturation, fill = PoolName)
+) +
+  geom_bar(stat = "identity", position = "dodge") +
+  labs(
+    x = "Samples",
+    title = "Sequencing Saturation per Sample"
+  ) +
+  theme(
+    text = element_text(size = 10),
+    plot.title = element_text(size = 18),
+    axis.title = element_text(size = 15),
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    axis.text.y = element_text(size = 15),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(1, "lines")
+  )
+```
+
+### Sequencing Saturation - Input Reads {.unnumbered}
+
+
+```{r sequencingSaturation_inputReads, fig.height = figHeight}
+
+
+# Scatter plot of the sequencing saturation against the number of input
+# reads: one point per row of qcData, coloured by pool.
+ggplot(
+  qcData,
+  aes(x = NumberOfInputReads, y = SequencingSaturation, colour = PoolName)
+) +
+  geom_point() +
+  labs(title = "Sequencing Saturation vs Number of Input Reads") +
+  theme(
+    text = element_text(size = 10),
+    plot.title = element_text(size = 18),
+    axis.title = element_text(size = 15),
+    axis.text = element_text(angle = 90, size = 15),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines")
+  )
+```
+
+### Sequencing Saturation - Chromosomal Reads {.unnumbered}
+
+```{r sequencingSaturation_mappedReads, fig.height = figHeight}
+# Scatter plot of the sequencing saturation against the number of chromosomal
+# reads: one point per row of qcData, coloured by pool.
+ggplot(
+  qcData,
+  aes(x = NumberOfChromReads, y = SequencingSaturation, colour = PoolName)
+) +
+  geom_point() +
+  labs(title = "Sequencing Saturation vs Number of Chromosomal Reads") +
+  theme(
+    text = element_text(size = 10),
+    plot.title = element_text(size = 18),
+    axis.title = element_text(size = 15),
+    axis.text = element_text(angle = 90, size = 15),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines")
+  )
+```
+
+<br>
+
+## Genomic Origin {.tabset} 
+
+The 3 boxplots below represent, per pool, the distributions of the percentage of reads mapping to:
+
+* chromosomal regions
+
+* mitochondrial regions
+
+* ERCC spike-ins
+
+The 4th plot summarises the above results across samples per pool.
+
+The 5th plot shows the percentage of reads mapped to the transcriptome (as counted by STAR).  This  measurement serves as a proxy for the percentage of reads mapped to exons.
+
+> The percentage ERCC contributes to the QC metrics mentioned in Par 3. This value is ideally as low as possible (but non-zero, to ensure that they have been spiked in) and comparable for the different pools.
+
+
+
+
+### pctChrom {.tabset .unnumbered}
+
+
+```{r genomicOrigin_chrom_plate, fig.height = figHeight}
+
+# Box plot of pctChrom per pool. The pool is encoded by the box colour, so the
+# x axis labels and ticks are hidden.
+ggplot(qcData, aes(x = PoolName, y = pctChrom, colour = PoolName)) +
+  geom_boxplot() +
+  labs(title = "pctChrom") +
+  theme(
+    text = element_text(size = 10),
+    plot.title = element_text(size = 18),
+    axis.title.y = element_text(size = 15),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines")
+  )
+```
+
+
+### pctMT {.tabset .unnumbered}
+
+```{r genomicOrigin_mt_plate, fig.height = figHeight}
+
+# Box plot of pctMT per pool. The pool is encoded by the box colour, so the
+# x axis labels and ticks are hidden.
+ggplot(qcData, aes(x = PoolName, y = pctMT, colour = PoolName)) +
+  geom_boxplot() +
+  labs(title = "pctMT") +
+  theme(
+    text = element_text(size = 10),
+    plot.title = element_text(size = 18),
+    axis.title.y = element_text(size = 15),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines")
+  )
+```
+
+### pctERCC {.tabset .unnumbered}
+
+
+```{r genomicOrigin_ercc_plate, fig.height = figHeight}
+# Box plot of pctERCC per pool. The pool is encoded by the box colour, so the
+# x axis labels and ticks are hidden.
+ggplot(
+  qcData,
+  aes(x = PoolName, y = pctERCC, colour = PoolName)
+) +
+  geom_boxplot() +
+  labs(title = "pctERCC") +
+  theme(
+    text = element_text(size = 10),
+    plot.title = element_text(size = 18),
+    axis.title.y = element_text(size = 15),
+    axis.text.y = element_text(angle = 90, size = 14),
+    axis.text.x = element_blank(),
+    axis.ticks.x = element_blank(),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15),
+    strip.text.x = element_text(size = 20),
+    panel.spacing = unit(2, "lines")
+  )
+```
+
+
+### Genomic Summary {.tabset .unnumbered}
+
+
+
+```{r genomicOrigin_summary_plate}
+# Summarise the three origin percentages per pool.
+# NOTE: despite the "mean" prefix of the variable names, the statistic that is
+# computed (and plotted) is the per-pool median.
+meanPctChromMTData <- qcData[
+  ,
+  .(
+    pctChrom = median(pctChrom),
+    pctMT = median(pctMT),
+    pctERCC = median(pctERCC)
+  ),
+  by = PoolName
+]
+
+# Reshape to long format: one row per (PoolName, Origin) pair.
+meanPctChromMTDataLong <- melt(
+  meanPctChromMTData,
+  id.vars = "PoolName",
+  measure.vars = c("pctChrom", "pctMT", "pctERCC"),
+  variable.name = "Origin",
+  value.name = "pct"
+)
+
+# Stacked bar per pool, one segment per origin.
+ggplot(
+  meanPctChromMTDataLong,
+  aes(x = PoolName, y = pct, fill = Origin)
+) +
+  geom_bar(stat = "identity", position = "stack") +
+  labs(title = "Genomic Origin") +
+  theme(
+    text = element_text(size = 10),
+    plot.title = element_text(size = 18),
+    axis.title = element_text(size = 15),
+    axis.text = element_text(angle = 90, size = 15),
+    legend.title = element_text(size = 17),
+    legend.text = element_text(size = 15)
+  )
+
+```
+
+
+
+# Depletion {.tabset}   
+
+<div align="center">
+```{r depletion}
+
+
+# For every pool: emit a tab heading, plot the (at most) 100 features with the
+# highest average count and print a table with their descriptions.
+# NOTE(review): the cat() calls write raw markdown/HTML, which presumably
+# relies on this chunk being rendered with results = "asis"; that option is
+# not set on the chunk header itself - confirm it is set globally.
+for (eset_name in pools) {
+  # Level-2 heading: becomes one tab of the enclosing "Depletion" tabset.
+  cat("\n\n")
+  cat(paste0("## ", eset_name, " {.unnumbered}"))
+  cat("\n\n")
+
+  eset <- esetList[[eset_name]]
+  # Average count per feature (row of the expression matrix), highest first.
+  average_reads <- sort(apply(exprs(eset), 1, mean), decreasing = TRUE)
+  plotData <- data.table(
+    ENSGID = names(average_reads),
+    av_count = average_reads
+  )
+
+  gen_descript <- data.table(
+    ENSGID = eset@featureData@data$gene_id,
+    Description = eset@featureData@data$GENENAME
+  )
+  # Put the descriptions in the same (descending average count) order as
+  # plotData.
+  order_gen_descript <- gen_descript[
+    match(plotData$ENSGID, gen_descript$ENSGID),
+  ]
+
+  # head() caps at the number of available rows, whereas indexing a data.table
+  # with 1:100 pads the result with NA rows when there are fewer than 100
+  # features.
+  g <- ggplot(
+    head(plotData, 100),
+    aes(x = reorder(ENSGID, -av_count), y = av_count)
+  ) + geom_bar(stat = "identity") +
+    theme(
+      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12),
+      axis.text.y = element_text(size = 12),
+      legend.text = element_text(size = 15),
+      legend.title = element_text(size = 15),
+      axis.title = element_text(size = 18),
+      plot.title = element_text(size = 20)
+    ) + ylab("Average Counts") + xlab("Genes")
+
+  print(g)
+
+  cat("\n\n")
+  cat("<br>")
+  cat("<br>")
+
+  print(htmltools::tagList((DT::datatable(head(order_gen_descript, 100)))))
+}
+```
+</div>
+
+
+<br>
+<br>
+<br>
+<br>
+
+# Glossary {.unnumbered}
+
+
+## Read {.unlisted .unnumbered}
+
+A read is an oligonucleotide (a short RNA fragment) that has been sequenced. It consists of a fixed number of base pairs (bp) and therefore has a specific read length.
+
+
+
+## Input Read {.unlisted .unnumbered}
+
+Each read of the fastq file used as input to the STAR aligner is considered an input read.
+
+
+
+## Read With Valid Barcode {.unlisted .unnumbered}
+
+A read with a valid barcode is a read for which the barcode matches the white list of barcodes under the given restriction of the number of allowed mismatches. The number of reads with a valid barcode is lower than or equal to the number of input reads.
+
+
+
+## Mapped Read {.unlisted .unnumbered}
+
+A read that has been aligned against the reference genome and for which one or more suitable matching locations have been found is a mapped read. Depending on the number of allowed mismatches, this might or might not be an exact match. The number of mapped reads is lower than or equal to the number of reads with a valid barcode.
+
+
+
+## Uniquely Mapped Read {.unlisted .unnumbered}
+
+A read for which one and only one suitable matching location in the reference genome was found is a uniquely mapped read. The number of uniquely mapped reads is lower than or equal to the number of mapped reads.
+
+
+
+## Counted Read {.unlisted .unnumbered}
+
+A mapped read will only be counted if it overlaps (1 nucleotide or more) with one and only one gene. The number of counted reads is lower than or equal to the number of (uniquely) mapped reads.
+
+
+
+## UMIs {.unlisted .unnumbered}
+
+Unique molecular identifiers (UMIs) are short sequences used to uniquely tag each molecule in a sample library. Sequencing with UMIs allows bioinformatics software to filter out duplicate reads and PCR errors with a high level of accuracy and report unique reads.
+
+The reported number of UMIs is the number of UMIs among the set of reads that map to a unique gene, i.e. the reads are deduplicated.
+
+
+
+## pctERCC {.unlisted .unnumbered}
+
+The percentage of reads mapping to the ERCC genes among the total number of **mapped** reads.
+
+
+
+## pctMT {.unlisted .unnumbered}
+
+The percentage of reads mapping to the MT genes among the total number of **mapped** reads.
+
+
+
+## Sequencing Saturation {.unlisted .unnumbered}
+
+The sequencing saturation is a measure of the fraction of library complexity. The inverse of one minus the sequencing saturation can be interpreted as the number of additional reads it would take to detect a new transcript. Consequently, a low sequencing saturation indicates a shallow sequencing in which a new transcript could be discovered with a few reads.
+
+<br>
+<br>
+<br>
+<br>
+
+<center>
+![](OutputSTARsolo.png)
+</center>
+
+<br>
+<br>
--- a/src/report/test.R
+++ b/src/report/test.R
@@ -0,0 +1,41 @@
+library(whisker)
+library(testthat)
+library(R.utils)
+
+# Test script for the report component: renders the report twice, once from
+# the test data files directly and once through symbolic links to them.
+# `meta` is not defined in this file; it is expected to be provided by the
+# test harness.
+cat(">> Creating temporary directory \n")
+Sys.setenv(TMP = meta$temp_dir)
+temp_folder <- tempdir(check = TRUE)
+
+cat(">> Running component create_report for test case \n")
+
+input_dir <- file.path(meta$resources_dir, "test_data")
+stopifnot(file.exists(input_dir))
+
+# The same two input files are used by both test cases.
+eset_sample_1 <- file.path(input_dir, "eset.sample_one.rds")
+eset_sample_2 <- file.path(input_dir, "eset.sample_two.rds")
+
+out <- processx::run(meta$executable, c(
+  "--eset", eset_sample_1,
+  "--eset", eset_sample_2,
+  "--output_report", "report.html"
+))
+
+expect_equal(out$status, 0)
+expect_true(file.exists("report.html"))
+
+cat(">>  Test succesful \n")
+
+cat(">> Running component create_report with symbolic links \n")
+
+# Link the inputs into the temporary folder and pass the links instead of the
+# original files.
+link_sample_1 <- file.path(temp_folder, "eset.sample_one.rds")
+link_sample_2 <- file.path(temp_folder, "eset.sample_two.rds")
+createLink(link = link_sample_1, target = eset_sample_1)
+createLink(link = link_sample_2, target = eset_sample_2)
+
+out <- processx::run(meta$executable, c(
+  "--eset", link_sample_1,
+  "--eset", link_sample_2,
+  "--output_report", "report2.html"
+))
+
+# Check the exit status here as well, consistent with the first test case.
+expect_equal(out$status, 0)
+expect_true(file.exists("report2.html"))
--- a/src/report/test_data/eset.sample_one.rds
+++ b/src/report/test_data/eset.sample_one.rds
--- a/src/report/test_data/eset.sample_two.rds
+++ b/src/report/test_data/eset.sample_two.rds
--- a/src/stats/combine_star_logs/config.vsh.yaml
+++ b/src/stats/combine_star_logs/config.vsh.yaml
@@ -0,0 +1,72 @@
+# Viash component that combines, per barcode, the metrics from several STAR
+# output files into a single tab-delimited table.
+name: combine_star_logs
+namespace: "stats"
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [author, maintainer]
+argument_groups:
+  - name: "Arguments"
+    arguments:
+      # The four multi-value inputs below are paired by position and must
+      # therefore all contain the same number of entries (the script raises
+      # an error otherwise).
+      - name: "--barcodes"
+        type: string
+        multiple: true
+        required: true
+        description: |
+          Barcodes corresponding to the respective log files.
+      - name: "--star_logs"
+        type: file
+        multiple: true
+        required: true
+        description: |
+          Paths to the STAR log files (most frequently called Log.final.out)
+        direction: input
+        example: "Log.final.out"
+      - name: "--gene_summary_logs"
+        direction: input
+        type: file
+        multiple: true
+        required: true
+        description: |
+          Paths to the Summary.csv files from the STAR Solo output. Can be found in
+          the 'Solo.out/Gene' folder relative to the root of the STAR output directory.
+        example: "Summary.csv"
+      - name: "--reads_per_gene_logs"
+        direction: input
+        type: file
+        multiple: true
+        required: true
+        description: |
+          Paths to the 'ReadsPerGene.out.tab' files as output by STAR.
+      - name: "--output"
+        type: file
+        direction: output
+        default: "starLogs.txt"
+        description: |
+          Tab-delimited file describing for each barcode (as the rows), the metrics (as columns)
+          gathered from the different input files.
+
+resources:
+  - type: python_script
+    path: script.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: test_data
+
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: apt
+        packages:
+          - procps
+      - type: python
+        packages:
+          - pandas
+    test_setup:
+      - type: python
+        packages:
+          - viashpy
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/stats/combine_star_logs/script.py
+++ b/src/stats/combine_star_logs/script.py
@@ -0,0 +1,228 @@
+import logging
+import pandas as pd
+from itertools import batched, starmap
+
+### VIASH START
+# Placeholder values that allow running this script directly from the
+# repository root against the bundled test data.
+# NOTE(review): presumably Viash replaces everything between the VIASH START
+# and VIASH END markers with the real argument values when the component is
+# built - confirm against the Viash documentation.
+meta = {
+    "name": "combine_star_logs",
+}
+par = {
+    "star_logs": ["src/stats/combine_star_logs/test_data/barcode_1/Log.final.out",
+                  "src/stats/combine_star_logs/test_data/barcode_2/Log.final.out"],
+    "gene_summary_logs": ["src/stats/combine_star_logs/test_data/barcode_1/summary.csv",
+                          "src/stats/combine_star_logs/test_data/barcode_2/summary.csv"], 
+    "reads_per_gene_logs": ["src/stats/combine_star_logs/test_data/barcode_1/ReadsPerGene.out.tab",
+                            "src/stats/combine_star_logs/test_data/barcode_2/ReadsPerGene.out.tab"],
+    "output": "output.txt",
+    "barcodes": ["ACGG", "TTTT"],
+}
+
+### VIASH END
+
+# Configure the root logger to emit everything from DEBUG level upwards
+# through a stream handler.
+logger = logging.getLogger()
+console_handler = logging.StreamHandler()
+logger.addHandler(console_handler)
+logger.setLevel(logging.DEBUG)
+
+
+def handle_percentages(column_value):
+    # TODO: handle this more gracefully
+    if column_value:
+        return column_value.strip('%')
+    return column_value
+
+def star_log_to_dataframe(barcode: str, log_path) -> pd.DataFrame:
+    logger.info("Reading STAR log %s for barcode '%s'", log_path, barcode)
+    result = pd.read_table(log_path, sep=r"\|\t+", converters={"Value": handle_percentages},
+                           engine="python", header=None, skip_blank_lines=True,
+                           skipinitialspace=True, names=["Category", "Value"], index_col=0,
+                           skiprows=[0, 1, 2])
+    logger.info("Read %d row(s) and %d column(s) from STAR logs at %s", 
+                *result.shape, log_path)
+    return result
+
+
+def summary_to_dataframe(barcode: str, summary_path) -> pd.DataFrame:
+    logger.info("Reading summary log %s for barcode %s", summary_path, barcode)
+    result = pd.read_table(summary_path, sep=",",
+                           header=None, names=["Category", "Value"],
+                           index_col=0, dtype=pd.StringDtype())
+    logger.info("Read %d row(s) and %d column(s) from summary file at %s",
+                *result.shape, summary_path)
+    return result
+
+
+def reads_per_gene_to_dataframe(barcode, read_per_gene_path) -> pd.DataFrame:
+    """
+    Read a reads-per-gene file and reduce it to one metric: the sum of the
+    'Unstranded' column over all rows, reported as 'NumberOfCountedReads'.
+
+    The result is a dataframe with a single row ('NumberOfCountedReads'), a single
+    column ('Value') and an index named 'Category', so that it has the same layout
+    as the dataframes created from the other log files. `barcode` is only used
+    for logging.
+    """
+    logger.info("Reading reads per gene file %s for barcode %s", read_per_gene_path, barcode)
+    # The first four lines of the file are skipped.
+    # NOTE(review): presumably these are the N_* summary rows that STAR writes
+    # before the per-gene counts - confirm against the STAR manual.
+    result = pd.read_table(read_per_gene_path, skiprows=[0, 1, 2, 3], header=None, sep="\t",
+                           dtype={"geneID": pd.StringDtype(),
+                                  "Unstranded": pd.Int64Dtype(),
+                                  "posStrand": pd.Int64Dtype(),
+                                  "negStrand": pd.Int64Dtype()},
+                           index_col=0, names=["geneID", "Unstranded", "posStrand", "negStrand"])
+    result = result[["Unstranded"]] # Do not use .loc here because we need a DataFrame, not a Series
+    # Summing the one-column dataframe gives a Series indexed by column name,
+    # which becomes the index of the new dataframe.
+    df = pd.DataFrame({"Value": result.sum()})
+    df = df.rename({"Unstranded": "NumberOfCountedReads"}, errors="raise")
+    df.index.name = "Category"
+    logger.info("Read %d row(s) and %d column(s) from reads per gene file at %s",
+                *df.shape, read_per_gene_path)
+    return df
+
+def star_log_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame) -> pd.DataFrame:
+    """
+    For a single star log (Log.final.out) in dataframe format, filter out the
+    entries that are not needed and format the labels of the remaining metrics:
+        - Remove labels ending with ':'
+          (mostly the section separators like 'MULTI-MAPPING READS:' and 'UNMAPPED READS:')
+        - Remove the metrics we do not need based on the following keywords:
+          Mapping speed, Average, Number of splices, per base, chimeric reads, average
+        - Replace '%' with 'pect' in the labels and remove the remaining colons.
+        - Upper-case the first character of every word and remove the spaces.
+        - Rename 'UniquelyMappedReadsNumber' to 'NumberOfMappedReads' and
+          'UniquelyMappedReadsPect' to 'PctMappedReads'.
+
+    The dataframe provided as input must have an index with 1 level with the metric names.
+    The barcode is only used for logging.
+
+    Raises a KeyError when one of the two entries to rename is not present.
+    """
+    # Remove index values ending with ':' (rows like 'MULTI-MAPPING READS:','UNIQUE READS:')
+    logger.info("Filtering STAR logs for barcode %s. Starting with %d row(s) and %d column(s)", barcode, *df.shape)
+    to_keep = ~df.index.to_series().str.endswith(":")
+    # Remove index values where the values contain any of these substrings
+    regex_columns_to_remove = "Mapping speed|Average|Number of splices|per base|chimeric reads|average"
+    to_keep = to_keep & ~df.index.to_series().str.contains(regex_columns_to_remove, regex=True)
+    logger.info("Removed the following log entries for barcode '%s':\n\t%s",
+                barcode,
+                "\n\t".join(to_keep[~to_keep].index.to_list()))
+    result = df.loc[to_keep]
+
+    # Replace % by pect, remove colons, use camel case and remove spaces
+    # You might be tempted to use .title() to make everything uppercase,
+    # but characters which are already uppercase should stay that way.
+    # (example: NumberOfUMIs and not NumberOfUmis)
+    result.index = result.index.str.replace("%", "pect")\
+                    .str.replace(":", "")\
+                    .str.replace(r"(?:^|\s).", lambda m:m.group(0).upper(), regex=True)\
+                    .str.replace(" ", "")
+    result = result.rename({"UniquelyMappedReadsNumber": "NumberOfMappedReads", 
+                            "UniquelyMappedReadsPect": "PctMappedReads"}, errors="raise")
+    logger.info("Done filtering STAR logs for barcode %s. Result has %d row(s) and %d column(s). "
+                "Found entries:\n\t%s", 
+                barcode, *result.shape, "\n\t".join(result.index.to_list()))
+    return result
+
+
+def summary_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame) -> pd.DataFrame:
+    logger.info("Filtering and formatting summary logs for barcode %s. "
+                "Starting with %d row(s) and %d column(s)", barcode, *df.shape)
+    columns_to_remove = (
+        "Number of Reads",
+        "Q30 Bases in RNA read",
+        "Reads Mapped to Genome: Unique",
+        "Reads Mapped to Transcriptome: Unique Genes",
+        "Reads in Cells Mapped to Unique Genes",
+        "Median UMI per Cell",
+        "Median Genes per Cell",
+        "Reads Mapped to Genome: Unique+Multiple",
+        "Median Reads per Cell",
+        "Mean UMI per Cell",
+        "Mean Genes per Cell",
+    )
+
+    to_keep = ~df.index.isin(columns_to_remove)
+    logger.info("Removed the following summary entries for barcode '%s':\n\t%s",
+                barcode,
+                "\n\t".join(df.loc[~to_keep].index.to_list()))
+    result = df.loc[to_keep]
+    result.index = result.index.str.replace(r"(?:^|\s).", lambda m:m.group(0).upper(),
+                                            regex=True).str.replace(" ", "")
+    to_rename = {"UMIsInCells": "NumberOfUMIs", 
+                 "TotalGenesDetected": "NumberOfGenes"}
+    try:
+        result = result.rename(to_rename, errors="raise")
+    except KeyError as e:
+        raise KeyError(f"Tried to rename log entries ({','.join(to_rename)}) in the summary "
+                       f"log for barcode {barcode}, but an entry was not found in the file. "
+                       "Make sure that you are using the correct version of STAR."
+                       f"Available entries: {", ".join(result.index.to_list())}") from e
+    logger.info("Done filtering summary logs for barcode %s. Result has %d row(s) and %d column(s). "
+                "Found entries:\n\t%s",
+                barcode, *result.shape, "\n\t".join(result.index.to_list()))
+    return result
+
+
+def join_dfs(df_list, barcodes) -> pd.DataFrame:
+    # Combine the dataframes together and add the barcodes as a level to the dataframe
+    # in order to make a 2-level index (first level the barcodes and second level the metrics).
+    result = pd.concat(dict(zip(barcodes, df_list)), names=["WellBC"])
+    # Pivot the table by moving the metrics to the columns. Its added as an extra level, 
+    # so we can just frop the 'Values' level that was already there
+    result = result.unstack(level="Category").droplevel(0, axis="columns")
+    return result
+
+def main(par):
+    """
+    Gather the metrics from the STAR logs, the summary logs and the reads per gene
+    files for all barcodes and write them to one tab-delimited file, with one row
+    per barcode and one column per metric.
+
+    `par` is a dictionary with the keys 'star_logs', 'gene_summary_logs',
+    'reads_per_gene_logs', 'barcodes' (lists that are paired by position) and 'output'
+    (the path of the file to write).
+
+    Raises a ValueError when the four lists do not have the same length.
+    """
+    logger.info("Component started.")
+    # Provide an overview of the parameters in the logs
+    parameters_str = [f'\t{param}: {param_val}\n' for param, param_val in par.items()]
+    logger.info("Parameters:\n%s", "".join(parameters_str).rstrip())
+    star_logs, gene_summary_logs, reads_per_gene_logs, barcodes  = par["star_logs"], \
+        par["gene_summary_logs"], par["reads_per_gene_logs"], par["barcodes"]
+    number_of_inputs = tuple(len(i) for i in (star_logs, gene_summary_logs,
+                                              reads_per_gene_logs, barcodes))
+    if len(set(number_of_inputs)) != 1:
+        raise ValueError("Expected the same number of inputs for 'star_logs' (%d), "
+                         "'gene_summary_logs' (%d), 'reads_per_gene_logs' (%d) "
+                         "and 'barcodes' (%d)." % number_of_inputs)
+    
+    # For each kind of log: (function that reads a file, optional function that
+    # filters and formats the result, paths of the files to process).
+    logs_to_process = [
+        (star_log_to_dataframe, star_log_remove_unwanted_entries_and_adjust_format, star_logs),
+        (summary_to_dataframe, summary_remove_unwanted_entries_and_adjust_format, gene_summary_logs),
+        (reads_per_gene_to_dataframe, None, reads_per_gene_logs),
+    ]
+    logger.info("Formatting the contents of the log files.") 
+    all_logs_data = []
+    for df_generator, formatter, data in logs_to_process:
+        data_as_df = list(starmap(df_generator, zip(barcodes, data)))
+        data_formatted = data_as_df
+        if formatter:
+            data_formatted = list(starmap(formatter, zip(barcodes, data_as_df)))
+        data_joined = join_dfs(data_formatted, barcodes)
+        all_logs_data.append(data_joined)
+
+    logger.info("Joining entries across the different logs together.") 
+    # Every element has the barcodes as index, so the metrics of the different
+    # logs are placed next to each other as columns.
+    all_stats = pd.concat(all_logs_data, axis=1)
+    logger.info("Log statistics were gathered for the following barcodes: %s", 
+                ", ".join(all_stats.index.to_list()))
+    # Data type to convert each output column to.
+    dtypes = {
+        'NumberOfInputReads': pd.UInt64Dtype(),
+        'NumberOfMappedReads': pd.UInt64Dtype(),
+        'PctMappedReads': pd.Float64Dtype(),
+        'NumberOfReadsMappedToMultipleLoci': pd.UInt64Dtype(),
+        'PectOfReadsMappedToMultipleLoci':  pd.Float64Dtype(), 
+        'NumberOfReadsMappedToTooManyLoci': pd.UInt64Dtype(),
+        'PectOfReadsMappedToTooManyLoci':  pd.Float64Dtype(),
+        'NumberOfReadsUnmappedTooManyMismatches': pd.UInt64Dtype(),
+        'PectOfReadsUnmappedTooManyMismatches':  pd.Float64Dtype(),
+        'NumberOfReadsUnmappedTooShort': pd.UInt64Dtype(), 
+        'PectOfReadsUnmappedTooShort':  pd.Float64Dtype(),
+        'NumberOfReadsUnmappedOther': pd.UInt64Dtype(),
+        'PectOfReadsUnmappedOther': pd.Float64Dtype(),
+        'ReadsWithValidBarcodes': pd.Float64Dtype(),
+        'SequencingSaturation': pd.Float64Dtype(),
+        'Q30BasesInCB+UMI': pd.Float64Dtype(),
+        'ReadsMappedToTranscriptome:Unique+MultipeGenes': pd.Float64Dtype(),
+        'EstimatedNumberOfCells': pd.UInt64Dtype(),
+        'FractionOfReadsInCells': pd.Float64Dtype(),
+        'MeanReadsPerCell': pd.UInt64Dtype(),
+        'NumberOfUMIs': pd.UInt64Dtype(),
+        'NumberOfGenes': pd.UInt64Dtype(),
+        'NumberOfCountedReads': pd.UInt64Dtype(),
+    }
+    all_stats = all_stats.astype(dtypes) 
+    # batched() is used here to print a limited amount of columns at a time
+    # to make sure that they are all displayed (pandas might limit the view for readability)
+    logger.info("Summary of final output:\n%s\n",
+                "\n".join(repr(all_stats.loc[:,columns].describe())
+                          for columns in batched(all_stats.columns, 3))) 
+    logger.info("Writing output to %s", par["output"])
+    # Write the barcodes as a regular 'WellBC' column instead of as the index.
+    all_stats.reset_index("WellBC").to_csv(par["output"], sep="\t", header=True,
+                                           index=False, float_format='%g')
+    logger.info("Finished %s.", meta["name"])
+
+if __name__ == "__main__":
+    main(par)
--- a/src/stats/combine_star_logs/test.py
+++ b/src/stats/combine_star_logs/test.py
@@ -0,0 +1,125 @@
+import pytest
+import sys
+import re
+import pandas as pd
+from pathlib import Path
+from uuid import uuid4
+from subprocess import CalledProcessError
+
+### VIASH START
+# Placeholder values that allow running this test file directly from the
+# repository root.
+# NOTE(review): presumably Viash replaces everything between the VIASH START
+# and VIASH END markers when the test is run through Viash - confirm against
+# the Viash documentation.
+meta = {
+    "resources_dir": "./src/stats/combine_star_logs/",
+    "executable": "target/executable/stats/combine_star_logs/combine_star_logs",
+    "config": "src/stats/combine_star_logs/config.vsh.yaml"
+}
+### VIASH END
+
+@pytest.fixture
+def test_resources_path():
+    """Path to the 'test_data' folder inside the resources directory."""
+    return Path(meta["resources_dir"]) / "test_data"
+
+@pytest.fixture
+def barcode_1_star_log(test_resources_path):
+    """Path to the STAR log (Log.final.out) of the first barcode."""
+    return test_resources_path / "barcode_1" / "Log.final.out"
+
+@pytest.fixture
+def barcode_1_reads_per_gene_file(test_resources_path):
+    """Path to the reads per gene file of the first barcode."""
+    return test_resources_path / "barcode_1" / "ReadsPerGene.out.tab"
+
+@pytest.fixture
+def barcode_1_summary(test_resources_path):
+    """Path to the summary file of the first barcode."""
+    return test_resources_path / "barcode_1" / "summary.csv"
+
+@pytest.fixture
+def barcode_2_star_log(test_resources_path):
+    """Path to the STAR log (Log.final.out) of the second barcode."""
+    return test_resources_path / "barcode_2" / "Log.final.out"
+
+@pytest.fixture
+def barcode_2_reads_per_gene_file(test_resources_path):
+    """Path to the reads per gene file of the second barcode."""
+    return test_resources_path / "barcode_2" / "ReadsPerGene.out.tab"
+
+@pytest.fixture
+def barcode_2_summary(test_resources_path):
+    """Path to the summary file of the second barcode."""
+    return test_resources_path / "barcode_2" / "summary.csv"
+
+@pytest.fixture
+def random_path(tmp_path):
+    """
+    Factory fixture: returns a function that creates a new, uniquely named path
+    inside pytest's temporary directory, optionally with the given file extension
+    (provided without the leading dot). The file itself is not created.
+    """
+    def wrapper(extension=None):
+        extension = "" if not extension else f".{extension}"
+        return tmp_path / f"{uuid4()}{extension}"
+    return wrapper 
+
+def test_incorrect_number_of_inputs_raises(run_component,
+                                           barcode_1_star_log, barcode_2_star_log,
+                                           barcode_1_reads_per_gene_file, barcode_2_reads_per_gene_file,
+                                           barcode_1_summary, barcode_2_summary,
+                                           random_path):
+    """
+    Running the component with a single STAR log but two entries for all other
+    multi-value arguments must fail with a ValueError that reports the number
+    of entries provided for each argument.
+    """
+    output_path = random_path("txt")
+    with pytest.raises(CalledProcessError) as err:
+        run_component([
+            "--barcodes", "foo;bar",
+            "--star_logs", f"{barcode_1_star_log}", 
+            "--reads_per_gene_logs", f"{barcode_1_reads_per_gene_file};{barcode_2_reads_per_gene_file}",
+            "--gene_summary_logs", f"{barcode_1_summary};{barcode_2_summary}",
+            "--output", output_path,
+        ])
+    # The error message is looked up in the captured standard output of the failed process.
+    assert re.search(r"ValueError: Expected the same number of inputs for 'star_logs' \(1\), "
+                     r"'gene_summary_logs' \(2\), 'reads_per_gene_logs' \(2\) and 'barcodes' \(2\)\.",
+            err.value.stdout.decode('utf-8'))
+
+
+
+def test_equal_number_of_argument(run_component,
+                                  barcode_1_star_log, barcode_2_star_log,
+                                  barcode_1_reads_per_gene_file, barcode_2_reads_per_gene_file,
+                                  barcode_1_summary, barcode_2_summary,
+                                  random_path):
+    """
+    Running the component with two entries for every multi-value argument must
+    create an output file with one row per barcode ('foo' and 'bar') whose
+    contents, read back as strings, exactly match the expected table.
+    """
+    output_path = random_path("txt")
+    run_component([
+        "--barcodes", "foo;bar",
+        "--star_logs", f"{barcode_1_star_log};{barcode_2_star_log}", 
+        "--reads_per_gene_logs", f"{barcode_1_reads_per_gene_file};{barcode_2_reads_per_gene_file}",
+        "--gene_summary_logs", f"{barcode_1_summary};{barcode_2_summary}",
+        "--output", output_path,
+    ])
+    # We use strings here to make a comparison of the file contents without
+    # doing any inferences of the numerical data type (i.e. exact file contents).
+    # For each column, the first value belongs to 'foo' and the second to 'bar'.
+    expected_dict = {
+        'NumberOfInputReads': ["96398", "10155"], 
+        'NumberOfMappedReads': ["70824", "7179"], 
+        'PctMappedReads': ["73.47", "70.69"], 
+        'NumberOfReadsMappedToMultipleLoci': ["0", "0"], 
+        'PectOfReadsMappedToMultipleLoci': ["0", "0"], 
+        'NumberOfReadsMappedToTooManyLoci': ["22281", "2248"],
+        'PectOfReadsMappedToTooManyLoci': ["23.11", "22.14"],
+        'NumberOfReadsUnmappedTooManyMismatches': ["0", "0"], 
+        'PectOfReadsUnmappedTooManyMismatches': ["0", "0"], 
+        'NumberOfReadsUnmappedTooShort': ["2697", "553"], 
+        'PectOfReadsUnmappedTooShort': ["2.8", "5.45"], 
+        'NumberOfReadsUnmappedOther': ["596", "175"], 
+        'PectOfReadsUnmappedOther': ["0.62", "1.72"], 
+        'ReadsWithValidBarcodes': ["0.999782", "0.999803"],
+        'SequencingSaturation': ["0.0602963", "0.0539344"], 
+        'Q30BasesInCB+UMI': ["0.980096", "0.984461"],
+        'ReadsMappedToTranscriptome:Unique+MultipeGenes': ["0.60411", "0.530871"],
+        'EstimatedNumberOfCells': ["1", "1"],
+        'FractionOfReadsInCells': ["1", "1"],
+        'MeanReadsPerCell': ["53602", "4969"],
+        'NumberOfUMIs': ["50370", "4701"], 
+        'NumberOfGenes': ["8767", "2397"],
+        'NumberOfCountedReads': ["17", "15"],
+    }
+    expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype())
+    expected.index = pd.Index(["foo", "bar"], name="WellBC", dtype=pd.StringDtype())
+    assert output_path.is_file()
+
+    contents = pd.read_csv(output_path, sep="\t", index_col=0, dtype=pd.StringDtype())
+    # Quick check on a number of key columns before the full comparison.
+    assert set(("NumberOfInputReads", "SequencingSaturation",
+                "NumberOfGenes", "NumberOfUMIs", "NumberOfCountedReads",
+                "PctMappedReads")).issubset(set(contents.columns))
+    pd.testing.assert_frame_equal(contents, expected)
+
+
+
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__]))
--- a/src/stats/combine_star_logs/test_data/barcode_1/Log.final.out
+++ b/src/stats/combine_star_logs/test_data/barcode_1/Log.final.out
@@ -0,0 +1,37 @@
+                                 Started job on |	Jun 26 09:38:11
+                             Started mapping on |	Jun 26 09:38:14
+                                    Finished on |	Jun 26 09:38:23
+       Mapping speed, Million of reads per hour |	38.56
+
+                          Number of input reads |	96398
+                      Average input read length |	57
+                                    UNIQUE READS:
+                   Uniquely mapped reads number |	70824
+                        Uniquely mapped reads % |	73.47%
+                          Average mapped length |	56.93
+                       Number of splices: Total |	6432
+            Number of splices: Annotated (sjdb) |	6285
+                       Number of splices: GT/AG |	6331
+                       Number of splices: GC/AG |	33
+                       Number of splices: AT/AC |	2
+               Number of splices: Non-canonical |	66
+                      Mismatch rate per base, % |	0.61%
+                         Deletion rate per base |	0.01%
+                        Deletion average length |	1.38
+                        Insertion rate per base |	0.00%
+                       Insertion average length |	1.24
+                             MULTI-MAPPING READS:
+        Number of reads mapped to multiple loci |	0
+             % of reads mapped to multiple loci |	0.00%
+        Number of reads mapped to too many loci |	22281
+             % of reads mapped to too many loci |	23.11%
+                                  UNMAPPED READS:
+  Number of reads unmapped: too many mismatches |	0
+       % of reads unmapped: too many mismatches |	0.00%
+            Number of reads unmapped: too short |	2697
+                 % of reads unmapped: too short |	2.80%
+                Number of reads unmapped: other |	596
+                     % of reads unmapped: other |	0.62%
+                                  CHIMERIC READS:
+                       Number of chimeric reads |	0
+                            % of chimeric reads |	0.00%
--- a/src/stats/combine_star_logs/test_data/barcode_1/ReadsPerGene.out.tab
+++ b/src/stats/combine_star_logs/test_data/barcode_1/ReadsPerGene.out.tab
@@ -0,0 +1,8 @@
+N_unmapped	11111	22222	33333
+N_multimapping	0	0	0
+N_noFeature	44444	55555	66666
+N_ambiguous	77777	88888	99999
+gene1	2	0	0
+gene2	0	0	0
+gene3	6	0	6
+gene5	9	6	3
--- a/src/stats/combine_star_logs/test_data/barcode_1/summary.csv
+++ b/src/stats/combine_star_logs/test_data/barcode_1/summary.csv
@@ -0,0 +1,20 @@
+Number of Reads,96398
+Reads With Valid Barcodes,0.999782
+Sequencing Saturation,0.0602963
+Q30 Bases in CB+UMI,0.980096
+Q30 Bases in RNA read,0.799904
+Reads Mapped to Genome: Unique+Multiple,0.734704
+Reads Mapped to Genome: Unique,0.734704
+Reads Mapped to Transcriptome: Unique+Multipe Genes,0.60411
+Reads Mapped to Transcriptome: Unique Genes,0.556049
+Estimated Number of Cells,1
+Reads in Cells Mapped to Unique Genes,53602
+Fraction of Reads in Cells,1
+Mean Reads per Cell,53602
+Median Reads per Cell,53602
+UMIs in Cells,50370
+Mean UMI per Cell,50370
+Median UMI per Cell,50370
+Mean Genes per Cell,8767
+Median Genes per Cell,8767
+Total Genes Detected,8767
--- a/src/stats/combine_star_logs/test_data/barcode_2/Log.final.out
+++ b/src/stats/combine_star_logs/test_data/barcode_2/Log.final.out
@@ -0,0 +1,37 @@
+                                 Started job on |	Jun 26 09:38:56
+                             Started mapping on |	Jun 26 09:39:00
+                                    Finished on |	Jun 26 09:39:02
+       Mapping speed, Million of reads per hour |	18.28
+
+                          Number of input reads |	10155
+                      Average input read length |	57
+                                    UNIQUE READS:
+                   Uniquely mapped reads number |	7179
+                        Uniquely mapped reads % |	70.69%
+                          Average mapped length |	56.36
+                       Number of splices: Total |	526
+            Number of splices: Annotated (sjdb) |	495
+                       Number of splices: GT/AG |	502
+                       Number of splices: GC/AG |	4
+                       Number of splices: AT/AC |	1
+               Number of splices: Non-canonical |	19
+                      Mismatch rate per base, % |	0.85%
+                         Deletion rate per base |	0.00%
+                        Deletion average length |	1.09
+                        Insertion rate per base |	0.00%
+                       Insertion average length |	1.07
+                             MULTI-MAPPING READS:
+        Number of reads mapped to multiple loci |	0
+             % of reads mapped to multiple loci |	0.00%
+        Number of reads mapped to too many loci |	2248
+             % of reads mapped to too many loci |	22.14%
+                                  UNMAPPED READS:
+  Number of reads unmapped: too many mismatches |	0
+       % of reads unmapped: too many mismatches |	0.00%
+            Number of reads unmapped: too short |	553
+                 % of reads unmapped: too short |	5.45%
+                Number of reads unmapped: other |	175
+                     % of reads unmapped: other |	1.72%
+                                  CHIMERIC READS:
+                       Number of chimeric reads |	0
+                            % of chimeric reads |	0.00%
--- a/src/stats/combine_star_logs/test_data/barcode_2/ReadsPerGene.out.tab
+++ b/src/stats/combine_star_logs/test_data/barcode_2/ReadsPerGene.out.tab
@@ -0,0 +1,8 @@
+N_unmapped	101010	202020	303030
+N_multimapping	0	0	0
+N_noFeature	404040	505050	606060
+N_ambiguous	707070	808080	909090
+gene1	0	0	0
+gene2	0	0	0
+gene6	5	5	0
+gene4	10	2	8
--- a/src/stats/combine_star_logs/test_data/barcode_2/summary.csv
+++ b/src/stats/combine_star_logs/test_data/barcode_2/summary.csv
@@ -0,0 +1,20 @@
+Number of Reads,10155
+Reads With Valid Barcodes,0.999803
+Sequencing Saturation,0.0539344
+Q30 Bases in CB+UMI,0.984461
+Q30 Bases in RNA read,0.786064
+Reads Mapped to Genome: Unique+Multiple,0.706942
+Reads Mapped to Genome: Unique,0.706942
+Reads Mapped to Transcriptome: Unique+Multipe Genes,0.530871
+Reads Mapped to Transcriptome: Unique Genes,0.489316
+Estimated Number of Cells,1
+Reads in Cells Mapped to Unique Genes,4969
+Fraction of Reads in Cells,1
+Mean Reads per Cell,4969
+Median Reads per Cell,4969
+UMIs in Cells,4701
+Mean UMI per Cell,4701
+Median UMI per Cell,4701
+Mean Genes per Cell,2397
+Median Genes per Cell,2397
+Total Genes Detected,2397
--- a/src/stats/generate_pool_statistics/config.vsh.yaml
+++ b/src/stats/generate_pool_statistics/config.vsh.yaml
@@ -0,0 +1,56 @@
+name: generate_pool_statistics
+namespace: "stats"
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ author, maintainer ]
+  - __merge__: /src/base/authors/marijke_van_moerbeke.yaml
+    roles: [ contributor ]
+argument_groups:
+  - name: "Arguments"
+    arguments:
+      - name: "--nrReadsNrGenesPerChrom"
+        type: file
+        multiple: true
+        description: |
+          Paths to one or more input files, each containing a .tsv formatted table describing
+          per chromosome the number of reads that were mapped to that chromosome (NumberOfReads
+          column) and the number of genes on that chromosome that had at least one
+          read mapped to it (NumberOfGenes).
+        direction: input
+        default: [processedBamFile_well1.tsv, processedBamfile_well2.tsv]
+      - name: "--nrReadsNrGenesPerChromPool"
+        direction: output
+        type: file
+        multiple: false
+        description: |
+          Pivot table in tsv format of the combined input nrReadsNrGenesPerChrom files. Describes
+          per chromosome (as columns) the number of reads, as well as the total number 
+          of reads per cell barcode and the percentage of nuclear, ERCC and mitochondrial
+          reads.
+        example: "nrReadsNrGenesPerChrom.txt"
+
+resources:
+- type: python_script
+  path: script.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: apt
+        packages:
+          - procps
+      - type: python
+        packages:
+          - pandas
+    test_setup:
+      - type: python
+        packages:
+          - viashpy
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/stats/generate_pool_statistics/script.py
+++ b/src/stats/generate_pool_statistics/script.py
@@ -0,0 +1,88 @@
+import pandas as pd
+import re
+
+### VIASH START
+par = {
+    "nrReadsNrGenesPerChrom": ["src/stats/generate_pool_statistics/test1.tsv", "src/stats/generate_pool_statistics/test2.tsv"],
+    "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChrom_pool.txt"
+}
+
+### VIASH END
+
+INDEX_COL = ["WellBC", "WellID"]
+
+if __name__ == "__main__":
+    #########
+    # Read the nrReadsNrGenesPerChrom file of every well and stack them into one table
+    #########
+    nr_reads_nr_genes_wells = []
+    for nr_reads_nr_genes_file in par["nrReadsNrGenesPerChrom"]:
+        # Explicit dtypes: "Chr" must stay a string, also when every chromosome name is a number.
+        nr_reads_nr_genes_wells.append(pd.read_csv(nr_reads_nr_genes_file,
+                                                   header=0, delimiter="\t",
+                                                   dtype={"WellBC": pd.StringDtype(),
+                                                          "WellID": pd.StringDtype(),
+                                                          "Chr": pd.StringDtype(),
+                                                          "NumberOfReads": pd.UInt64Dtype(),
+                                                          "NumberOfGenes": pd.UInt64Dtype()}))
+    nr_reads_nr_genes_pool = pd.concat(nr_reads_nr_genes_wells, ignore_index=True)
+    # One row per well (WellBC, WellID) and one column per chromosome; missing combinations become 0.
+    total_nr_reads_per_chromosome = nr_reads_nr_genes_pool.pivot_table(index=INDEX_COL, columns="Chr",
+                                                                       values=["NumberOfReads"], fill_value=0,
+                                                                       aggfunc="sum").droplevel(0, axis=1)
+    total_nr_reads_per_chromosome.columns.name = None
+
+    ##### Total number of genes from all chromosomes
+    total_nr_genes = nr_reads_nr_genes_pool.loc[:, INDEX_COL + ['NumberOfGenes']].groupby(INDEX_COL).sum()
+
+    ##### Total number of reads per well (irrespective of chromosome)
+    total_sum_of_reads = total_nr_reads_per_chromosome.sum(numeric_only=True, axis=1)
+
+    ##### Logic to split up chromosome per type
+    chromosome_names = total_nr_reads_per_chromosome.columns.to_list()
+    chr_regex = re.compile(r"^(chr)?\d+")
+    matching_chromosomes = [chr_name for chr_name
+                            in chromosome_names
+                            if chr_regex.match(chr_name)]
+    mitochondrial_chr_name = "MT"
+    # This is logic from the original HT pipeline: only when all of the numbered chromosomes
+    # start with "chr", the mitochondrial chromosome is looked up as "chrM" instead of "MT".
+    # NOTE(review): all() is also True when there is no numbered chromosome at all - confirm intended.
+    if all(chr_name.startswith("chr") for chr_name in matching_chromosomes):
+        mitochondrial_chr_name = "chrM"
+
+    ###### Counts for mitochondrial reads
+    try:
+        mitochondrial_reads = total_nr_reads_per_chromosome.loc[:, mitochondrial_chr_name]
+    except KeyError:
+        mitochondrial_reads = 0  # the pool has no column for the mitochondrial chromosome
+    percentage_mitochondrial_reads = round(mitochondrial_reads / total_sum_of_reads * 100, 2)
+
+    ###### Counts for ERCC reads
+    total_ercc_reads = total_nr_reads_per_chromosome.filter(regex=r"^ERCC").sum(axis=1)
+    percentage_ercc_reads = round(total_ercc_reads / total_sum_of_reads * 100, 2)
+
+    ###### Counts for the numbered chromosomes (names matching chr_regex; X and Y do not match)
+    total_chromosomal_reads = total_nr_reads_per_chromosome.loc[:, matching_chromosomes].sum(axis=1)
+    percentage_chromosomal_reads = round(total_chromosomal_reads / total_sum_of_reads * 100, 2)
+
+    cols_to_add = {
+        "pctChrom": percentage_chromosomal_reads,
+        "pctMT": percentage_mitochondrial_reads,
+        "pctERCC": percentage_ercc_reads,
+        "SumReads": total_sum_of_reads,
+        "NumberOfGenes": total_nr_genes,
+        "NumberOfERCCReads": total_ercc_reads,
+        "NumberOfChromReads": total_chromosomal_reads,
+        "NumberOfMTReads": mitochondrial_reads,
+    }
+    total_nr_reads_per_chromosome = total_nr_reads_per_chromosome.assign(
+        **cols_to_add
+    )
+
+    total_nr_reads_per_chromosome.reset_index(names=INDEX_COL)\
+        .to_csv(par["nrReadsNrGenesPerChromPool"], sep="\t",
+                header=True, index=False, float_format="%g",
+                columns=tuple(INDEX_COL) + tuple(chromosome_names) + tuple(cols_to_add.keys())
+               )
+
--- a/src/stats/generate_pool_statistics/test.py
+++ b/src/stats/generate_pool_statistics/test.py
@@ -0,0 +1,174 @@
+from uuid import uuid4
+from textwrap import dedent
+import pandas as pd
+import pytest
+import sys
+
+### VIASH START
+meta = {
+    "resources_dir": "./src/stats/generate_pool_statistics/",
+    "executable": "target/executable/stats/generate_pool_statistics/generate_pool_statistics",
+    "config": "src/stats/generate_pool_statistics/config.vsh.yaml"
+}
+### VIASH END
+
+@pytest.fixture
+def random_path(tmp_path):
+    def wrapper(extension=None):
+        suffix = f".{extension}" if extension else ""  # the "." is added here, so pass e.g. "tsv"
+        return tmp_path / f"{uuid4()}{suffix}"  # uuid4() gives a new file name on every call
+    return wrapper
+
+
+@pytest.fixture
+def random_tsv_path(random_path):
+    def wrapper():
+        return random_path("tsv")  # random_path() prepends the "." itself; ".tsv" gave "<uuid>..tsv"
+    return wrapper
+
+
+@pytest.fixture
+def simple_input_file_one(random_tsv_path, request):
+    prefix = request.param  # chromosome name prefix, injected through indirect parametrization
+    mito_name = f"{prefix}M{'T' if not prefix else ''}"  # "MT" for an empty prefix, else "<prefix>M"
+
+    contents = dedent(
+    f"""\
+    WellBC	WellID	Chr	NumberOfReads	NumberOfGenes
+    AGG	A1	{prefix}1	2	1
+    AGG	A1	{prefix}2	3	2
+    AGG	A1	{prefix}3	4	2
+    AGG	A1	{mito_name}	4	2
+    AGG	A1	{prefix}X	2	3
+    AGG	A1	ERCC-1	1	1
+    AGG	A1	ERCC-2	1	1
+    """)
+    output_file = random_tsv_path()
+    with output_file.open("w") as open_file:
+        open_file.write(contents)
+    return output_file  # path of the table written for well AGG / A1
+
+
+@pytest.fixture
+def simple_input_file_two(random_tsv_path, request):
+    prefix = request.param  # chromosome name prefix, injected through indirect parametrization
+    contents = dedent(
+    f"""\
+    WellBC	WellID	Chr	NumberOfReads	NumberOfGenes
+    CCC	B2	{prefix}2	2	1
+    CCC	B2	{prefix}3	3	2
+    CCC	B2	{prefix}5	4	2
+    CCC	B2	{prefix}1	4	2
+    CCC	B2	{prefix}Y	2	3
+    CCC	B2	{prefix}X	2	3
+    CCC	B2	ERCC-3	1	1
+    CCC	B2	ERCC-2	1	1
+    """)
+    output_file = random_tsv_path()
+    with output_file.open("w") as open_file:
+        open_file.write(contents)
+    return output_file  # path of the table written for well CCC / B2 (it has no mitochondrial row)
+
+
+@pytest.mark.parametrize("simple_input_file_one,simple_input_file_two,expected", [("chr", "chr", "chr"), ("", "", "")], 
+                         indirect=["simple_input_file_one", "simple_input_file_two"])
+def test_generate_pool_statistics_simple(run_component, simple_input_file_one,
+                                         simple_input_file_two, random_tsv_path, expected):
+    # Pool the two fixture wells and compare the whole output table, with and without "chr" prefix.
+    output_path = random_tsv_path()
+    run_component([
+        "--nrReadsNrGenesPerChrom", simple_input_file_one,
+        "--nrReadsNrGenesPerChrom", simple_input_file_two,
+        "--nrReadsNrGenesPerChromPool", output_path
+    ])
+    mito_name = f"{expected}M{'T' if not expected else ''}"  # same naming as simple_input_file_one
+    expected_dict = {
+        "WellBC": ["AGG", "CCC"],
+        "WellID": ["A1", "B2"],
+        "ERCC-1": ["1", "0"],
+        "ERCC-2": ["1", "1"],
+        "ERCC-3": ["0", "1"],
+        f"{expected}1": ["2", "4"],
+        f"{expected}2": ["3", "2"],
+        f"{expected}3": ["4", "3"],
+        f"{expected}5": ["0", "4"],
+        f"{mito_name}": ["4", "0"],
+        f"{expected}X": ["2", "2"],
+        f"{expected}Y": ["0", "2"],
+        "SumReads": ["17", "19"],
+        "pctMT": ["23.53", "0"],
+        "pctERCC": ["11.76", "10.53"],
+        "pctChrom": ["52.94", "68.42"],
+        "NumberOfGenes": ["12", "15"],
+        "NumberOfMTReads": ["4", "0"],
+        "NumberOfChromReads": ["9", "13"],
+        "NumberOfERCCReads": ["2", "2"],
+    }
+    expected_frame = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype())
+    assert output_path.is_file()
+    contents = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype())  # read every value as text
+    pd.testing.assert_frame_equal(contents, expected_frame, check_like=True)  # column order is not compared
+
+
+def test_only_numerical_chromosomes(run_component, random_tsv_path):
+    """
+    The chromosome column might be read as an integer instead of a string,
+    make sure that a chromosome column holding only numerical names works.
+    """
+    output_path = random_tsv_path()
+    contents1 = dedent(
+    """\
+    WellBC	WellID	Chr	NumberOfReads	NumberOfGenes
+    CCC	B2	2	2	1
+    CCC	B2	3	3	2
+    CCC	B2	5	4	2
+    CCC	B2	1	4	2
+    """)
+    input_file_1 = random_tsv_path()
+    with input_file_1.open("w") as open_file:
+        open_file.write(contents1)
+
+    contents2 = dedent(
+    """\
+    WellBC	WellID	Chr	NumberOfReads	NumberOfGenes
+    AGG	A1	2	2	1
+    AGG	A1	3	3	2
+    AGG	A1	5	4	2
+    AGG	A1	1	4	2
+    """)
+    input_file_2 = random_tsv_path()
+    with input_file_2.open("w") as open_file:
+        open_file.write(contents2)
+    run_component([
+        "--nrReadsNrGenesPerChrom", input_file_1,
+        "--nrReadsNrGenesPerChrom", input_file_2,
+        "--nrReadsNrGenesPerChromPool", output_path
+    ])
+
+    # Both wells were given identical counts, so every expected column repeats one value.
+    expected_dict = {
+        "WellBC": ["AGG", "CCC"],
+        "WellID": ["A1", "B2"],
+        "1": ["4", "4"],
+        "2": ["2", "2"],
+        "3": ["3", "3"],
+        "5": ["4", "4"],
+        "pctChrom": ["100", "100"],
+        "pctMT": ["0", "0"],
+        "pctERCC": ["0", "0"],
+        "SumReads": ["13", "13"],
+        "NumberOfGenes": ["7", "7"],
+        "NumberOfERCCReads": ["0", "0"],
+        "NumberOfChromReads": ["13", "13"],
+        "NumberOfMTReads": ["0", "0"],
+    }
+    expected_frame = pd.DataFrame.from_dict(expected_dict,
+                                            dtype=pd.StringDtype())
+
+    assert output_path.is_file()
+    contents = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype())
+    pd.testing.assert_frame_equal(contents, expected_frame, check_like=True)
+
+
+if __name__ == '__main__':  # only when this file is executed directly as a script
+    sys.exit(pytest.main([__file__]))  # run pytest on this file; its return value is the exit status
--- a/src/stats/generate_well_statistics/config.vsh.yaml
+++ b/src/stats/generate_well_statistics/config.vsh.yaml
@@ -0,0 +1,102 @@
+name: generate_well_statistics
+namespace: "stats"
+description: Generate summary statistics from BAM files generated by STAR solo.
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ author, maintainer ]
+  - __merge__: /src/base/authors/marijke_van_moerbeke.yaml
+    roles: [ contributor ]
+argument_groups:
+  - name: "Arguments"
+    arguments:
+      - name: "--input"
+        type: file
+        description: "The .bam file as returned by the mapping tool STAR."
+        direction: input
+        example: "input.bam"
+      - name: "--barcode"
+        type: string
+        description: |
+          The barcode for the well that is being processed. Is only used to add a metadata
+          column to all output files.
+        required: true
+      - name: "--well_id"
+        type: string
+        description: |
+          ID of this well. Only used to add a metadata column to the output files.
+        required: true
+      - name: "--processedBAMFile"
+        type: file
+        description: |
+          Path to a .tsv file listing, per read in the BAM file,
+          the value for the "CB", "UB", "GX" and "GN" tag, together with the
+          chromosome to which the read was mapped.
+        direction: output
+        default: processedBamFile.txt
+      - name: "--nrReadsNrGenesPerChrom"
+        type: file
+        description: |
+          Path to an output file that contains a .tsv formatted table describing
+          per chromosome the number of reads that were mapped to that chromosome (NumberOfReads
+          column) and the number of genes on that chromosome that had at least one
+          read mapped to it (NumberOfGenes).
+        default: nrReadsNrGenesPerChrom.txt
+        direction: output
+      - name: "--nrReadsNrUMIsPerCB"
+        type: file
+        description: |
+          Path to an output file that contains a .tsv formatted table describing
+          per barcode the number of UMI's (nrUMIs) and the total number of reads (NumberOfReads).
+        direction: output
+        default: nrReadsNrUMIsPerCB.txt
+      - name: "--umiFreqTop"
+        type: file
+        description: |
+          Path to an output file that contains a .tsv formatted table describing
+          per UMI (column UB) the frequency at which they occur in the reads (column
+          N). Only the top 100 UMIs are included.
+        default: umiFreqTop100.txt
+        direction: output
+      - name: "--threads"
+        type: integer
+        description: |
+          Number of threads to use for decompressing BAM files.
+        min: 1
+        default: 1
+resources:
+- type: python_script
+  path: script.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: test.sam
+
+engines:
+  - type: docker
+    image: debian:stable-slim
+    setup:
+      - type: docker
+        env:
+          - PIP_BREAK_SYSTEM_PACKAGES=1
+          - HTSLIB_LIBRARY_DIR=/usr/lib/
+          - HTSLIB_INCLUDE_DIR=/usr/include/
+      - type: apt
+        packages:
+          - python3
+          - python3-pip
+          - python3-venv
+          - python-is-python3
+          - libhts-dev
+          - procps
+      - type: python
+        packages:
+          - pysam
+          - pandas
+    test_setup:
+      - type: python
+        packages:
+          - viashpy
+runners:
+  - type: executable
+  - type: nextflow
--- a/src/stats/generate_well_statistics/script.py
+++ b/src/stats/generate_well_statistics/script.py
@@ -0,0 +1,77 @@
+import pysam
+import pandas as pd
+import logging
+
+### VIASH START
+par = {
+    "input": "src/stats/generate_well_statistics/test.sam",
+    "processedBAMFile": "processedBamFile.txt",
+    "nrReadsNrGenesPerChrom": "nrReadsNrGenesPerChrom.txt",
+    "nrReadsNrUMIsPerCB": "nrReadsNrUMIsPerCB.txt",
+    "umiFreqTop": "umiFreqTop.txt",
+    "threads": 1,
+    "barcode": "ACGT",
+    "well_id": "A1"
+}
+### VIASH END
+logger = logging.getLogger()
+console_handler = logging.StreamHandler()
+logger.addHandler(console_handler)
+logger.setLevel(logging.DEBUG)
+
+if __name__ == "__main__":
+    logger.info("Component started.")
+    parameters_str = [f'\t{param}: {param_val}\n' for param, param_val in par.items()]
+    logger.info("Parameters:\n%s", "".join(parameters_str).rstrip())
+    logger.info("Opening '%s'", par["input"])
+    all_tags, index = [], []  # per read: its tags, and the name of the reference it is aligned to
+    tags_selection = ("CB", "UB", "GX", "GN")
+    # The context manager closes the alignment file again, also when reading fails halfway.
+    with pysam.AlignmentFile(par["input"], "rb", threads=par["threads"]) as samfile:
+        for aligned_segment in samfile:
+            tags = dict(aligned_segment.get_tags())
+            all_tags.append(tags)
+            reference_name = aligned_segment.reference_name
+            index.append("*" if not reference_name else reference_name)  # "*": no reference name
+    tag_dataframe = pd.DataFrame.from_records(all_tags, index=index,
+                                              columns=tags_selection)
+    logger.info("Done reading BAM file. Found %i entries", tag_dataframe.shape[0])
+    tag_dataframe.assign(WellBC=par["barcode"], WellID=par["well_id"])\
+        .reset_index(names="Chr")\
+        .to_csv(par["processedBAMFile"], sep="\t", na_rep="",
+                header=True, index=False,
+                columns=("WellBC", "WellID", "Chr") + tags_selection)
+    logger.info("Constructing of dataframe done.")
+    # Per chromosome: the number of reads that carry a gene ID ("GX" tag) and the
+    # number of distinct genes among them. Reads without a "GX" value are left out.
+    nr_reads_nr_genes = tag_dataframe.dropna(subset=["GX"]).groupby(level=0).agg(
+        NumberOfReads=pd.NamedAgg("GX", aggfunc="size"),
+        NumberOfGenes=pd.NamedAgg(column="GX", aggfunc="nunique")
+    )
+    logger.info("Done calculating number of reads per gene and per chromesome. Writing to %s",
+                par['nrReadsNrGenesPerChrom'])
+    nr_reads_nr_genes.reset_index(names="Chr").assign(WellBC=par["barcode"], WellID=par["well_id"])\
+        .to_csv(par["nrReadsNrGenesPerChrom"], sep="\t",
+                header=True, index=False,
+                columns=("WellBC", "WellID", "Chr", "NumberOfReads", "NumberOfGenes"))
+
+    # Number of reads per UMI ("UB" tag); only the 100 most frequent UMIs are kept
+    nr_read_per_umi = tag_dataframe.groupby('UB').size()\
+        .drop("", errors="ignore").sort_values(ascending=False).head(100)
+    nr_read_per_umi_df = nr_read_per_umi.to_frame(name="N")
+    logger.info("Done calculating number of mapped reads per UMI, writing to %s", par["umiFreqTop"])
+    nr_read_per_umi_df.assign(WellBC=par["barcode"], WellID=par["well_id"]).reset_index(names="UB")\
+        .to_csv(par["umiFreqTop"], header=True, sep="\t",
+                index=False, columns=("WellBC", "WellID", "UB", "N"))
+
+    # Per cell barcode ("CB" tag): the number of reads and the number of distinct UMIs
+    nr_reads_and_umi_per_barcode = tag_dataframe.groupby(by="CB").agg(
+        NumberOfReads=pd.NamedAgg("CB", "size"),
+        nrUMIs=pd.NamedAgg("UB", "nunique")
+    )
+    logger.info("Done calculating number of mapped reads and number of UMIs per Cell Barcode, writing to %s",
+                par["nrReadsNrUMIsPerCB"])
+    nr_reads_and_umi_per_barcode.assign(WellBC=par["barcode"], WellID=par["well_id"]).reset_index(names="CB")\
+        .to_csv(par["nrReadsNrUMIsPerCB"], sep="\t", header=True,
+                index=False, columns=("WellBC", "WellID", "CB", "NumberOfReads", "nrUMIs"))
+    logger.info("Finished!")
--- a/src/stats/generate_well_statistics/test.py
+++ b/src/stats/generate_well_statistics/test.py
@@ -0,0 +1,111 @@
+import sys
+import pytest
+import pysam
+from uuid import uuid4
+from pathlib import Path
+from textwrap import dedent
+
+### VIASH START
+meta = {
+    "resources_dir": "./src/stats/generate_well_statistics/",
+    "executable": "target/executable/stats/generate_well_statistics/generate_well_statistics",
+    "config": "src/stats/generate_well_statistics/config.vsh.yaml"
+}
+### VIASH END
+
+def assert_file_content_equals(file_to_check, expected):
+    # Compare the complete text of the file against the expected string.
+    with file_to_check.open('r') as handle:
+        assert handle.read() == expected
+
+
+@pytest.fixture
+def input_sam_path():
+    return Path(meta["resources_dir"]) / "test.sam"  # test.sam inside the resources directory from meta
+
+
+@pytest.fixture
+def random_path(tmp_path):
+    def wrapper(extension=None):
+        suffix = f".{extension}" if extension else ""  # the "." is added here, so pass e.g. "tsv"
+        return tmp_path / f"{uuid4()}{suffix}"  # uuid4() gives a new file name on every call
+    return wrapper
+
+@pytest.fixture
+def random_bam_path(random_path):
+    def wrapper():
+        return random_path("bam")  # random_path() prepends the "." itself; ".bam" gave "<uuid>..bam"
+    return wrapper
+
+
+@pytest.fixture
+def sam_to_bam(random_bam_path):
+    def wrapper(sam_file):
+        out_path = random_bam_path()
+        # Both files are closed when the with statement exits, so no explicit close() is needed.
+        with pysam.AlignmentFile(sam_file, "r") as infile, \
+            pysam.AlignmentFile(out_path, "wb", template=infile) as outfile:
+            for s in infile:
+                outfile.write(s)
+        return out_path
+    return wrapper
+
+
+def test_generate_well_statistics_simple_bam(run_component, input_sam_path, sam_to_bam, random_path):
+    bam_file = sam_to_bam(input_sam_path)  # the component is run on a BAM copy of test.sam
+    processed_bam = random_path("tsv")
+    reads_per_chromosome = random_path("tsv")
+    nr_reads_nr_umis_per_cb = random_path("tsv")
+    top_onehundred_umis = random_path("tsv")
+    run_component([
+        "--input", bam_file,
+        "--processedBAMFile", processed_bam,
+        "--nrReadsNrGenesPerChrom", reads_per_chromosome,
+        "--nrReadsNrUMIsPerCB", nr_reads_nr_umis_per_cb,
+        "--umiFreqTop", top_onehundred_umis,
+        "--barcode", "ACGT",
+        "--well_id", "A1",
+    ])
+    for file_path in (processed_bam, reads_per_chromosome,
+                      nr_reads_nr_umis_per_cb, top_onehundred_umis):
+        assert file_path.is_file()  # every requested output file must have been written
+
+    expected_processed_bam = \
+    dedent("""\
+    WellBC	WellID	Chr	CB	UB	GX	GN
+    ACGT	A1	1	ACA	CGG	gene1	gene1
+    ACGT	A1	1	ACA	CGG	gene1	gene1
+    ACGT	A1	2	GGG	GTT	gene2	gene2
+    ACGT	A1	2	GGG	GTC	gene3	gene3
+    """)
+
+    expected_reads_per_chromosome = \
+    dedent("""\
+    WellBC	WellID	Chr	NumberOfReads	NumberOfGenes
+    ACGT	A1	1	2	1
+    ACGT	A1	2	2	2
+    """)
+
+    expected_nr_reads_nr_umis_per_cb = \
+    dedent("""\
+    WellBC	WellID	CB	NumberOfReads	nrUMIs
+    ACGT	A1	ACA	2	1
+    ACGT	A1	GGG	2	2
+    """)
+
+    expected_top_onehundred_umis = \
+    dedent("""\
+    WellBC	WellID	UB	N
+    ACGT	A1	CGG	2
+    ACGT	A1	GTC	1
+    ACGT	A1	GTT	1
+    """)
+
+    assert_file_content_equals(processed_bam, expected_processed_bam)
+    assert_file_content_equals(reads_per_chromosome, expected_reads_per_chromosome)
+    assert_file_content_equals(nr_reads_nr_umis_per_cb, expected_nr_reads_nr_umis_per_cb)
+    assert_file_content_equals(top_onehundred_umis, expected_top_onehundred_umis)
+
+
+if __name__ == '__main__':  # only when this file is executed directly as a script
+    sys.exit(pytest.main([__file__]))  # run pytest on this file; its return value is the exit status
--- a/src/stats/generate_well_statistics/test.sam
+++ b/src/stats/generate_well_statistics/test.sam
@@ -0,0 +1,7 @@
+@HD	VN:1.4	SO:coordinate
+@SQ	SN:1	LN:200
+@SQ	SN:2	LN:50
+test_1	16	1	22	255	1M	*	0	0	C	I	NH:i:1	HI:i:1	nM:i:0	AS:i:47	CR:Z:ACA	UR:Z:CGG	GX:Z:gene1	GN:Z:gene1	CB:Z:ACA	UB:Z:CGG
+test_2	16	1	22	255	1M	*	0	0	G	!	NH:i:1	HI:i:1	nM:i:0	AS:i:47	CR:Z:ACA	UR:Z:CGG	GX:Z:gene1	GN:Z:gene1	CB:Z:ACA	UB:Z:CGG
+test_3	0	2	40	255	1M	*	0	0	T	!	NH:i:1	HI:i:1	nM:i:0	AS:i:47	CR:Z:GGG	UR:Z:GTT	GX:Z:gene2	GN:Z:gene2	CB:Z:GGG	UB:Z:GTT
+test_4	0	2	60	255	1M	*	0	0	C	!	NH:i:1	HI:i:1	nM:i:0	AS:i:47	CR:Z:GGG	UR:Z:GTC	GX:Z:gene3	GN:Z:gene3	CB:Z:GGG	UB:Z:GTC
--- a/src/workflows/htrnaseq/config.vsh.yaml
+++ b/src/workflows/htrnaseq/config.vsh.yaml
@@ -0,0 +1,126 @@
+name: htrnaseq
+namespace: workflows
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ maintainer ]
+argument_groups:
+  - name: Input arguments
+    arguments:
+      - name: --input_r1
+        description: |
+          Forward reads in FASTQ format. Multiple files can be provided which will
+          be demultiplexed separately before joining the results for each individual well.
+        type: file
+        required: true
+        multiple: true
+      - name: --input_r2
+        description: |
+          Reverse reads in FASTQ format. Multiple files can be provided which will
+          be demultiplexed separately before joining the results for each individual well.
+        type: file
+        required: true
+        multiple: true
+      - name: --barcodesFasta
+        type: file
+        required: true
+      - name: --genomeDir
+        type: file
+        required: true
+      - name: --annotation
+        type: file
+        required: true
+  - name: Output arguments
+    arguments:
+      - name: --fastq_output_r1
+        description: List of demultiplexed fastq files
+        type: file
+        direction: output
+        multiple: true
+        required: true
+        default: "fastq/*_R1_001.fastq"
+      - name: --fastq_output_r2
+        description: List of demultiplexed fastq files
+        type: file
+        direction: output
+        multiple: true
+        required: true
+        default: "fastq/*_R2_001.fastq"
+      - name: --star_output
+        description: Output from mapping with STAR
+        type: file
+        direction: output
+        multiple: true
+        required: true
+        default: $id/star/*
+      - name: "--nrReadsNrGenesPerChrom"
+        type: file
+        direction: output
+        required: true
+        default: "nrReadsNrGenesPerChrom.$id.txt"
+      - name: "--star_qc_metrics"
+        type: file
+        direction: output
+        required: true
+        default: "starLogs.$id.txt"
+      - name: "--eset"
+        type: file
+        direction: output
+        required: true
+        default: eset.$id.rds
+      - name: "--f_data"
+        type: file
+        direction: output
+        required: true
+        default: fData.$id.tsv
+      - name: "--p_data"
+        type: file
+        direction: output
+        required: true
+        default: pData.$id.tsv
+      - name: "--html_report"
+        type: file
+        direction: output
+        required: true
+        default: report.html
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+test_resources:
+  - type: nextflow_script
+    path: test.nf
+    entrypoint: test_wf
+dependencies:
+  - name: stats/combine_star_logs
+    repository: local
+  - name: stats/generate_pool_statistics
+    repository: local
+  - name: stats/generate_well_statistics
+    repository: local
+  - name: workflows/well_demultiplex
+    repository: local
+  - name: workflows/parallel_map_wf
+    repository: local
+  - name: workflows/utils/groupWells
+    repository: local
+  - name: eset/create_eset
+    repository: local
+  - name: eset/create_fdata
+    repository: local
+  - name: eset/create_pdata
+    repository: local
+  - name: report/create_report
+    repository: local
+# Every dependency of this workflow is resolved from this repository itself,
+# so only the local repository needs to be declared.
+repositories:
+  - name: local
+    type: local
+
+runners:
+  - type: nextflow
+
+engines:
+  - type: native
--- a/src/workflows/htrnaseq/integration_test.sh
+++ b/src/workflows/htrnaseq/integration_test.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Abort on the first failing command, on the use of an unset variable and on
+# failures inside pipelines, so that a failed build never reaches the test run.
+set -euo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# Make sure the workflow is built
+viash ns build --setup cb --parallel
+
+# Pin the Nextflow version used for the test run
+export NXF_VER=24.04.4
+
+nextflow \
+  run . \
+  -main-script src/workflows/htrnaseq/test.nf \
+  -config ./src/config/labels.config \
+  -entry test_wf \
+  -resume \
+  -profile docker,local \
+  --publish_dir output
--- a/src/workflows/htrnaseq/main.nf
+++ b/src/workflows/htrnaseq/main.nf
@@ -0,0 +1,225 @@
+workflow run_wf {
+  take:
+    input_ch
+
+  main:
+    // The featureData only has one requirement: the genome annotation.
+    // It can be generated straight away.
+    f_data_ch = input_ch
+      | create_fdata.run(
+        directives: [label: ["lowmem", "lowcpu"]],
+        fromState: ["gtf": "annotation"],
+        toState: {id, result, state -> ["f_data": result.output]}
+      )
+
+    // Perform mapping of each well. The input here are events per pool,
+    // the output channel is one event per well.
+    mapping_ch = input_ch
+      | well_demultiplex.run(
+        fromState: [
+            "input_r1": "input_r1",
+            "input_r2": "input_r2",
+            "barcodesFasta": "barcodesFasta",
+        ],
+        toState: { id, result, state ->
+          def filtered_input = state.findAll{!["input_r1", "input_r2"].contains(it.key)} 
+          def filtered_results = result.findAll{!["output_r1", "output_r2"].contains(it.key)} 
+          def new_state = filtered_input + filtered_results + [ 
+            "fastq_output_r1": result.output_r1, 
+            "fastq_output_r2": result.output_r2,
+          ]
+          return new_state
+        }
+      )
+      | parallel_map_wf.run(
+        fromState: {id, state ->
+          [
+            "input_r1": state.fastq_output_r1[0],
+            "input_r2": state.fastq_output_r2[0],
+            "barcode": state.barcode,
+            "pool": state.pool,
+            "output": state.star_output[0],
+            "genomeDir": state.genomeDir,
+          ]
+        },
+        toState: ["star_output": "output"]
+      )
+
+    // From the mapped wells, create statistics based on the BAM file
+    // and join the events back to pool level.
+    pool_ch = mapping_ch
+      | generate_well_statistics.run(
+        directives: [label: ["verylowmem", "verylowcpu"]],
+        fromState: { id, state ->
+          [
+            "input": state.star_output.resolve('Aligned.sortedByCoord.out.bam'),
+            "barcode": state.barcode,
+            "well_id": state.well_id,
+          ]
+        },
+        toState: [
+          "nrReadsNrGenesPerChrom": "nrReadsNrGenesPerChrom",
+        ]
+      )
+      | map {id, state ->
+        // Create a special groupKey, such that groupTuple
+        // knows when all the barcodes have been grouped into 1 event.
+        // This way the processing is as distributed as possible.
+        def key = groupKey(state.pool, state.n_wells)
+        def newEvent = [key, state]
+        return newEvent
+      }
+      // Use a custom sorting function because sort: 'hash'
+      // requires a hash to be calculated on every entry of the state.
+      // This is inefficient when the number of events is large
+      // (i.e. a large number of barcodes).
+      // Sorting on lexicographical order of the barcode is sufficient here.
+      | groupTuple(sort: {a, b -> a.barcode <=> b.barcode})
+      | map {id, states ->
+        // Gather the keys from all states. For some state items,
+        // we need to gather all the different items from across the states.
+        def barcodes = states.collect{it.barcode}
+        assert barcodes.clone().unique().size() == barcodes.size(), \
+          "Error when gathering information for pool ${id}, barcodes are not unique!"
+        def well_ids = states.collect{it.well_id}
+        assert well_ids.clone().unique().size() == well_ids.size(), \
+          "Error when gathering information for pool ${id}, well IDs are not unique!"
+        // State items that differ per well are collected into lists
+        // (one entry per well, in the barcode order established by groupTuple).
+        def custom_state = [
+          "fastq_output_r1": states.collect{it.fastq_output_r1[0]},
+          "fastq_output_r2": states.collect{it.fastq_output_r2[0]},
+          "barcode": barcodes,
+          "well_id": well_ids,
+          "star_output": states.collect{it.star_output},
+          "nrReadsNrGenesPerChrom": states.collect{it.nrReadsNrGenesPerChrom},
+        ]
+        // For many state items, the value is the same across states.
+        def other_state_keys = states.inject([].toSet()){ current_keys, state ->
+            def new_keys = current_keys + state.keySet()
+            return new_keys
+          }.minus(custom_state.keySet())
+        // All other state items should have a single, unique value.
+        def old_state_items = other_state_keys.inject([:]){ old_state, argument_name ->
+            // 'def' keeps this variable local to the closure; without it the
+            // assignment would leak into the enclosing script scope.
+            def argument_values = states.collect{it.get(argument_name)}.unique()
+            assert argument_values.size() == 1, \
+              "Arguments should be the same across the wells of a pool. Please report this " +
+              "as a bug. Argument name: $argument_name, argument value: $argument_values"
+            def current_state = old_state + [(argument_name): argument_values[0]]
+            return current_state
+          }
+
+        def new_state = custom_state + old_state_items
+        // Replace the groupKey wrapper by the plain pool name as event ID.
+        [id.getGroupTarget(), new_state]
+      }
+
+    // The well statistics are merged on pool level. 
+    pool_statistics_ch = pool_ch
+      | generate_pool_statistics.run(
+        directives: ["label": ["lowmem", "verylowcpu"]],
+        fromState: [
+          "nrReadsNrGenesPerChrom": "nrReadsNrGenesPerChrom",
+        ],
+        toState: [
+          "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChromPool"
+        ]
+      )
+
+    // The statistics from the STAR logs of different wells are joined
+    // on pool level 
+    star_logs_ch = pool_ch
+      | combine_star_logs.run(
+        directives: ["label": ["lowmem", "verylowcpu"]],
+        fromState: {id, state -> [
+            "star_logs": state.star_output.collect{it.resolve("Log.final.out")},
+            "gene_summary_logs": state.star_output.collect{it.resolve("Solo.out/Gene/Summary.csv")},
+            "reads_per_gene_logs": state.star_output.collect{it.resolve("ReadsPerGene.out.tab")},
+            "barcodes": state.barcode,
+            "output": state.star_qc_metrics
+          ]
+        },
+        toState: [
+          "star_qc_metrics": "output",
+        ]
+      )
+    
+    p_data_ch = star_logs_ch.join(pool_statistics_ch, remainder: true)
+      | map {id, star_logs_state, pool_statistics_state ->
+        def newState = star_logs_state + ["nrReadsNrGenesPerChromPool": pool_statistics_state.nrReadsNrGenesPerChromPool]
+        return [id, newState]
+      }
+      | create_pdata.run(
+        directives: [label: ["lowmem", "lowcpu"]],
+        fromState: [
+          "star_stats_file": "star_qc_metrics",
+          "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChromPool",
+        ],
+        toState: ["p_data": "output"],
+      )
+
+    eset_ch = p_data_ch.join(f_data_ch, remainder: true)
+      | map {id, p_data_state, f_data_state ->
+        def newState = p_data_state + ["f_data": f_data_state["f_data"]]
+        [id, newState]
+      }
+      | create_eset.run(
+        directives: [label: ["lowmem", "lowcpu"]],
+        fromState: [
+          "pDataFile": "p_data",
+          "fDataFile": "f_data",
+          "mappingDir": "star_output",
+          "output": "eset",
+          "barcodes": "barcode",
+          "poolName": "pool",
+        ],
+        toState: [
+          "eset": "output",
+        ]
+      )
+
+    report_channel = eset_ch
+      | toSortedList()
+      | map {ids_and_states ->
+        def states = ids_and_states.collect{it[1]}
+        def html_report = states[0].html_report
+        def ids = ids_and_states.collect{it[0]}
+        def esets = states.collect{it.eset}
+        ["report", ["esets": esets, "html_report": html_report, "original_ids": ids]]
+      }
+      | create_report.run(
+        fromState: [
+          "eset": "esets",
+          "output_report": "html_report",
+        ],
+        toState: [
+          "html_report": "output_report"
+        ]
+      )
+      | flatMap {id, state ->
+        state.original_ids.collect{original_id ->
+          [original_id, ["html_report": state.html_report]]
+        }
+      }
+
+    // Attach the report to every pool-level event and restrict the state
+    // to the entries listed below.
+    output_ch = eset_ch.join(report_channel)
+      | map {id, state_eset, state_report ->
+        def new_state = state_eset + ["html_report": state_report.html_report]
+        [id, new_state]
+      }
+      | setState([
+        "star_output": "star_output",
+        "fastq_output_r1": "fastq_output_r1",
+        "fastq_output_r2": "fastq_output_r2",
+        // The pool-level statistics are exposed under the 'nrReadsNrGenesPerChrom' key.
+        "nrReadsNrGenesPerChrom": "nrReadsNrGenesPerChromPool",
+        "star_qc_metrics": "star_qc_metrics",
+        "eset": "eset",
+        "f_data": "f_data",
+        "p_data": "p_data",
+        "html_report": "html_report",
+      ])
+
+
+  emit:
+    output_ch
+}
--- a/src/workflows/htrnaseq/nextflow.config
+++ b/src/workflows/htrnaseq/nextflow.config
@@ -0,0 +1,8 @@
+
+params {
+  rootDir = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString()
+}
+
+
+// include common settings
+includeConfig("${params.rootDir}/src/config/labels.config")
--- a/src/workflows/htrnaseq/test.nf
+++ b/src/workflows/htrnaseq/test.nf
@@ -0,0 +1,45 @@
+nextflow.enable.dsl=2
+targetDir = params.rootDir + "/target/nextflow"
+
+include { htrnaseq } from targetDir + "/workflows/htrnaseq/main.nf"
+include { check_eset } from targetDir + "/integration_test_components/htrnaseq/check_eset/main.nf"
+
+
+params.resources_test =  "gs://viash-hub-test-data/htrnaseq/v1/"
+
+// Integration test: runs the htrnaseq workflow on two test samples and
+// passes the result of the first sample to the check_eset component.
+workflow test_wf {
+  // Location that all test inputs below are resolved against.
+  resources_test_file = file(params.resources_test)
+  // Two events (one per sample) that use the same genome index,
+  // barcodes FASTA and annotation, but different FASTQ pairs.
+  input_ch = Channel.fromList([
+      [
+          id: "sample_one",
+          input_r1: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R1_001.fastq"),
+          input_r2: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R2_001.fastq"),
+          genomeDir: resources_test_file.resolve("genomeDir/gencode.v41.star.sparse"),
+          barcodesFasta: resources_test_file.resolve("360-wells-with-ids.fasta"),
+          annotation: resources_test_file.resolve("genomeDir/gencode.v41.annotation.gtf.gz")
+      ],
+      [
+          id: "sample_two",
+          input_r1: resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R1_001.fastq"),
+          input_r2: resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R2_001.fastq"),
+          genomeDir: resources_test_file.resolve("genomeDir/gencode.v41.star.sparse"),
+          barcodesFasta: resources_test_file.resolve("360-wells-with-ids.fasta"),
+          annotation: resources_test_file.resolve("genomeDir/gencode.v41.annotation.gtf.gz")
+      ]
+    ])
+    // Convert each map into an [id, state] event.
+    | map{ state -> [state.id, state] }
+    | view { "Input: $it" }
+    // Only the eset and the STAR output are mapped into the state here.
+    | htrnaseq.run(
+        toState: [
+            "eset": "eset",
+            "star_output": "star_output",
+        ]
+    )
+    // The check is only executed for the event with ID "sample_one".
+    | check_eset.run(
+        runIf: {id, state -> id == "sample_one"},
+        toState: [
+            "eset": "eset",
+            "star_output": "star_output"
+        ]
+    )
+}
--- a/src/workflows/parallel_map_wf/config.vsh.yaml
+++ b/src/workflows/parallel_map_wf/config.vsh.yaml
@@ -0,0 +1,60 @@
+name: parallel_map_wf
+namespace: workflows
+description: |
+  Map RNA sequencing data, provided as fastq files (paired-end) to a reference genome using STAR Solo.
+  Input data must have been demultiplexed beforehand, meaning that a single fastq pair provides data for
+  one barcode (one well). Multiple wells can be mapped in parallel by providing multiple events to the
+  workflow. Wells are grouped per pool for mapping, after which the mapped output is provided per well,
+  i.e. one output is provided for each input event.
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ maintainer ]
+argument_groups:
+  - name: "Arguments"
+    arguments:
+      - name: "--input_r1"
+        type: file
+        direction: input
+        required: true
+      - name: "--input_r2"
+        type: file
+        direction: input
+        required: true
+      - name: "--barcode"
+        type: string
+        required: true
+      - name: "--pool"
+        type: string
+        required: true
+      - name: "--genomeDir"
+        type: file
+        required: true
+        direction: input
+      - name: "--output"
+        type: file
+        direction: output
+        required: true
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+
+# test_resources:
+#   - type: nextflow_script
+#     path: test.nf
+#     entrypoint: test_wf
+
+dependencies:
+  - name: parallel_map
+    repository: local
+  - name: workflows/utils/groupWells
+    repository: local
+repositories:
+  - name: local
+    type: local
+
+runners:
+  - type: nextflow
+
+engines:
+  - type: native
+
--- a/src/workflows/parallel_map_wf/main.nf
+++ b/src/workflows/parallel_map_wf/main.nf
@@ -0,0 +1,74 @@
+workflow run_wf {
+    take:
+    input_ch
+
+    main:
+    pool_ch = input_ch
+      | groupWells.run(
+        fromState: [
+          "input_r1": "input_r1",
+          "input_r2": "input_r2",
+          "well": "barcode",
+          "pool": "pool",
+        ],
+        toState: [
+          "wells": "wells",
+          "input_r1": "output_r1",
+          "input_r2": "output_r2",
+        ]
+      )
+      | parallel_map.run(
+        fromState: { id, state ->
+         [
+           "input_r1": state.input_r1,
+           "input_r2": state.input_r2,
+           "genomeDir": state.genomeDir,
+           "barcodes": state.wells,
+           "pool": state.pool,
+           "wellBarcodesLength": 10,
+           "umiLength": 10,
+           "output": state.output,
+         ]
+        },
+        toState: ["output": "output"],
+        directives: ["label": ["highmem", "lowcpu"]],
+      )
+      | setState(["output", "pool"])
+
+    // input_ch is on well level, while parallel_map outputs
+    // one event per pool that holds the outputs of all wells in that pool.
+    // Join the results back to well level
+    input_join_ch = input_ch
+      | map {id, state ->
+        def newEvent = [state.pool, id, state]
+        return newEvent
+      }
+
+    // Pair every well-level input event with the mapping result of its pool
+    // and pick the STAR output that belongs to the barcode of that well.
+    output_ch = input_join_ch.combine(pool_ch, by: 0)
+      | map {pool, well_id, state_well, state_pool ->
+        def well_output = state_pool.output.findAll{star_output_dir ->
+          // Get the barcode from the STAR file.
+          // One STAR output contains the results for one
+          // well barcode. We can look for the barcode in
+          // the 'Solo.out/Gene/raw/barcodes.tsv' file.
+          def barcodes_files = files("${star_output_dir}/Solo.out/Gene/raw/barcodes.tsv")
+          assert barcodes_files.size() == 1, \
+            "Exactly one file should have matched the barcodes files (found: $barcodes_files)."
+          def barcodes_file = barcodes_files[0]
+          assert barcodes_file.countLines() == 1, \
+            "Expected only one barcode in a single STAR output."
+          return barcodes_file.text.trim() == state_well.barcode
+        }
+        // Report the barcode of the well itself: the barcode read from the
+        // STAR output only exists inside the closure above.
+        assert well_output.size() == 1, \
+          "Expected exactly one mapping output for barcode '${state_well.barcode}' " +
+          "in pool '${pool}', but found ${well_output.size()}."
+        [well_id, ["output": well_output[0]]]
+      }
+
+
+    emit:
+    output_ch
+}
--- a/src/workflows/utils/groupWells/config.vsh.yaml
+++ b/src/workflows/utils/groupWells/config.vsh.yaml
@@ -0,0 +1,56 @@
+name: groupWells
+namespace: workflows/utils
+description: |
+  Group well-level events per pool, collecting the wells and their R1/R2 inputs into lists.
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ maintainer ]
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--well"
+        type: string
+        description: Barcode identifier for a well
+        required: true
+        example: barcode_1
+      - name: "--pool"
+        type: string
+        description: Identifier of the pool
+        required: true
+        example: pool_1
+      - name: "--input_r1"
+        type: file
+        description: Path to the input for R1
+        required: true
+        example: input.fastq.gz
+      - name: "--input_r2"
+        type: file
+        description: Path to the input for R2
+        required: true
+        example: input.fastq.gz
+
+  - name: Output
+    arguments:
+      - name: "--wells"
+        type: string
+        description: List of grouped wells (by means of barcodes)
+        multiple: true
+        direction: output
+        example: barcode_1
+      - name: "--output_r1"
+        type: file
+        description: Path to the output for R1
+        multiple: true
+        direction: output
+      - name: "--output_r2"
+        type: file
+        description: Path to the output for R2
+        multiple: true
+        direction: output
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+
+runners:
+  - type: nextflow
--- a/src/workflows/utils/groupWells/main.nf
+++ b/src/workflows/utils/groupWells/main.nf
@@ -0,0 +1,25 @@
+workflow run_wf {
+
+  take: in_
+
+  main:
+
+    // Re-key every event on its pool and gather all events of the same pool.
+    grouped_ = in_
+      | map{ event_id, event_state -> [event_state.pool, event_state, event_id] }
+      | groupTuple(sort: "hash")
+
+    // Turn each group into one pool-level event holding the collected lists.
+    out_ = grouped_
+      | map{ pool_id, well_states, event_ids ->
+        def pool_state = [
+          output_r1: well_states.collect{ well_state -> well_state.input_r1 },
+          output_r2: well_states.collect{ well_state -> well_state.input_r2 },
+          wells: well_states.collect{ well_state -> well_state.well },
+          // The first original event ID of the group is kept as join ID.
+          _meta: [ join_id: event_ids[0] ]
+        ]
+        [pool_id, pool_state]
+      }
+
+  emit: out_
+
+}
+
--- a/src/workflows/well_demultiplex/config.vsh.yaml
+++ b/src/workflows/well_demultiplex/config.vsh.yaml
@@ -0,0 +1,99 @@
+name: well_demultiplex
+namespace: workflows
+description: Demultiplexing on well level
+authors:
+  - __merge__: /src/base/authors/dries_schaumont.yaml
+    roles: [ maintainer ]
+  - __merge__: /src/base/authors/marijke_van_moerbeke.yaml
+    roles: [ contributor ]
+argument_groups:
+  - name: Input arguments
+    arguments:
+      - name: --input_r1
+        description: |
+          Forward reads in FASTQ format. Multiple files can be provided which will
+          be demultiplexed separately before joining the results for each individual well.
+        type: file
+        required: true
+        multiple: true
+      - name: --input_r2
+        description: |
+          Reverse reads in FASTQ format. Multiple files can be provided which will
+          be demultiplexed separately before joining the results for each individual well.
+        type: file
+        required: true
+        multiple: true
+      - name: --barcodesFasta
+        type: file
+        required: true
+  - name: Output arguments
+    arguments:
+      - name: --output_r1
+        description: List of demultiplexed fastq files
+        type: file
+        direction: output
+        multiple: true
+        required: true
+        default: "fastq/*_R1_001.fastq"
+      - name: "--output_r2"
+        description: List of demultiplexed fastq files
+        type: file
+        direction: output
+        multiple: true
+        required: true
+        default: "fastq/*_R2_001.fastq" 
+      - name: "--pool"
+        type: string
+        description: The original pool / sample name
+        direction: output
+      - name: "--well_id"
+        type: string
+        direction: output
+      - name: "--barcode"
+        type: string
+        direction: output
+      - name: "--lane"
+        type: string
+        direction: output
+      - name: "--pair_end"
+        type: string
+        direction: output
+      - name: "--n_wells"
+        type: integer
+        direction: output
+        description: The number of wells in the pool this well is a part of.
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+
+# Test dataset: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM5357044
+test_resources:
+  - type: nextflow_script
+    path: test.nf
+    entrypoint: test_wf
+  - type: nextflow_script
+    path: test.nf
+    entrypoint: test_wf2
+
+dependencies:
+  - name: cutadapt
+    repository: bb
+  - name: concat_text
+    repository: cb
+repositories:
+  - name: bb
+    type: vsh
+    repo: biobox
+    tag: v0.2.0
+  - name: cb
+    type: vsh
+    repo: craftbox
+    tag: v0.1.0
+
+runners:
+  - type: nextflow
+
+engines:
+  - type: native
+
--- a/src/workflows/well_demultiplex/integration_test.sh
+++ b/src/workflows/well_demultiplex/integration_test.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Abort on the first failing command, on the use of an unset variable and on
+# failures inside pipelines, so that a failed build or a failed first test
+# run is not masked by the commands that follow.
+set -euo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# Make sure the workflow is built
+viash ns build --setup cb --parallel
+
+# Pin the Nextflow version used for the test runs
+export NXF_VER=24.04.4
+
+# First test entrypoint
+nextflow \
+  run . \
+  -main-script src/workflows/well_demultiplex/test.nf \
+  -config ./src/config/labels.config \
+  -entry test_wf \
+  -resume \
+  -profile docker,local \
+  --publish_dir output
+
+
+# Second test entrypoint, published to a separate directory
+nextflow \
+  run . \
+  -main-script src/workflows/well_demultiplex/test.nf \
+  -config ./src/config/labels.config \
+  -entry test_wf2 \
+  -resume \
+  -profile docker,local \
+  --publish_dir output_2
--- a/src/workflows/well_demultiplex/main.nf
+++ b/src/workflows/well_demultiplex/main.nf
@@ -0,0 +1,267 @@
+workflow run_wf {
+  take:
+    input_ch
+
+  main:
+    output_ch = input_ch
+      /*
+      Parse the fasta file containing the barcodes and do the following:
+        - The sequence headers must not contain any whitespaces
+        - The headers (Well IDs) must be unique
+        - The barcodes must be unique
+        - Store the number of barcodes in the state
+        - Add a barcode to well ID (header) mapping to the state,
+          in order to be able to retrieve the well ID based on the FASTQ name after well demultiplexing
+      */
+      | map {id, state ->
+        def n_wells = state.barcodesFasta.countFasta() as int
+        // The header is the full header, the id is the part header up to the first whitespace character
+        // We do not allow whitespace in the header of the fasta file, so assert this.
+        def fasta_entries = state.barcodesFasta.splitFasta(
+          record: ["id": true, "header": true, "seqString": true]
+        )
+        assert fasta_entries.every{it.id == it.header}, \
+          "The barcodes FASTA headers must not contain any whitespace!"
+        // Check if the fasta headers are unique
+        def fasta_ids = fasta_entries.collect{it.id}
+        assert fasta_ids.clone().unique() == fasta_ids, \
+          "The barcodes FASTA entries must have a unique name!"
+        // Check if the sequences are unique
+        def fasta_sequences = fasta_entries.collect{it.seqString}
+        assert fasta_sequences.clone().unique() == fasta_sequences, \
+          "The barcodes FASTA sequences must be unique!"
+        def well_id_matcher = /^([A-Za-z]+)0*([1-9]?[0-9]+)$/
+        def entries_corrected_id = fasta_entries.collectEntries { it ->
+          def unformatted_id = it.header
+          def id_matched_to_format = unformatted_id =~ well_id_matcher
+          assert (id_matched_to_format && id_matched_to_format.getCount() == 1), \
+            "The FASTA headers must match the coordinate system of a well plate (e.g. A01, B01, ... or AA1, AB1, ...). Found: ${unformatted_id}"
+          def id_letters = id_matched_to_format[0][1].toUpperCase()
+          def id_numbers = id_matched_to_format[0][2]
+          ["${id_letters}${id_numbers}", it.seqString]
+        }
+        def newState = state + [
+          "n_wells": n_wells,
+          "well_id_barcode_mapping": entries_corrected_id,
+        ]
+        [id, newState]
+      }
+      /*
+      For each pool (i.e. event) in the channel, a list of R1 and R2 input
+      reads is provided which correspond to the lanes. If there are multiple lanes,
+      we can demultiplex into the wells for each lane in parallel. Therefore, cutadapt
+      must be started multiple times and we need an event per lane. The events are
+      created by taking the R1 and R2 pairs from the input lists. The index of the elements
+      in these lists are added to the ID in order to make them unique.
+      */
+      | flatMap {id, state ->
+        assert state.input_r1.size() == state.input_r2.size(), \
+          "Expected equal number of inputs for R1 and R2"
+        // Store the number of lanes that were encountered here in order to
+        // group them together in an asynchronous manner later by providing
+        // the expected number of events to be grouped to groupTuple.
+        // see https://www.nextflow.io/docs/latest/reference/operator.html#grouptuple
+        def n_lanes = state.input_r1.size()
+        [state.input_r1, state.input_r2].transpose().withIndex().collect{ input_pair, index ->
+          def single_input_r1 = input_pair[0]
+          def single_input_r2 = input_pair[1]
+          def newState = state + ["input_r1": single_input_r1,
+                                  "input_r2": single_input_r2,
+                                  "pool": id,
+                                  "lane_sorting": index,
+                                  "n_lanes": n_lanes]
+          def newId = id + "_" + index
+          [newId, newState]
+        }
+      }
+      | cutadapt.run(
+        directives: [label: ["highmem", "midcpu"]],
+        fromState: { id, state ->
+          def new_output = ("fastq_${state.lane_sorting}/*_001.fastq")
+          [
+            input: state.input_r1,
+            input_r2: state.input_r2,
+            no_indels: true,
+            action: "none",
+            front_fasta: state.barcodesFasta,
+            output: new_output,
+            error_rate: 0.10,
+            demultiplex_mode: "single",
+          ]
+        },
+        toState: { id, result, state ->
+          def newState = [
+            "pool": state.pool,
+            "n_lanes": state.n_lanes,
+            "output": result.output,
+            "lane_sorting": state.lane_sorting,
+            "n_wells": state.n_wells,
+            "well_id_barcode_mapping": state.well_id_barcode_mapping,
+          ]
+          return newState
+        }
+      )
+      // Parse the file names to obtain metadata about the output
+      | flatMap{ id, state ->
+        def pool = state.pool
+        state.output.collect{ p ->
+          def well_id_matcher = p =~ /.*\\/([A-Za-z0-9]*|unknown)_R?.*/
+          assert well_id_matcher, \
+            "Could not find Well ID in the name of FASTQ file ($p) output from cutadapt."
+          def well_id = well_id_matcher[0][1]
+          // Note: set the barcode to 'null' for reads that were put into 'unknown'
+          def barcode = (well_id != "unknown") ? state.well_id_barcode_mapping[well_id].replaceAll("[^ACGTacgt]", "") : null
+          assert (well_id == "unknown") || (barcode != null), \
+            "After demultiplexing, no Well ID could be retreived for barcode ${barcode}."
+          def pair_end_matcher = p =~ /.*_(R[12])_.*/
+          assert pair_end_matcher, \
+            "Could not find read orientation information in the name of the FASTQ file ($p) output from cutadapt."
+          def pair_end = pair_end_matcher[0][1]
+          def lane_matcher = p =~ /.*_(L\d+).*/
+          def lane = lane_matcher ? lane_matcher[0][1] : "NA"
+          def new_id = pool + "__" + well_id
+          [
+            new_id,
+            [
+              "pool": pool,
+              "barcode": barcode,
+              "well_id": well_id,
+              "output": p,
+              "lane": lane,
+              "n_wells": state.n_wells,
+              "pair_end": pair_end,
+              "n_lanes": state.n_lanes,
+              "lane_sorting": state.lane_sorting,
+              "_meta": [ "join_id": pool ]
+            ]
+          ]
+        }
+      }
+      /*
+      At this point, the events are provided on the smallest possible level,
+      as each event represents the reads for a certain orientation from a
+      particular lane and a single well. Here, we join these events back together
+      on well level, gathering FASTQS across the lanes and read orientations.
+      In order to make this joining as efficient as possible, the number of
+      lanes which are expected to be gathered were stored in the state earlier.
+      This way, the processing of a well can continue as soon as all of
+      the lanes have been gathered. The number of lanes times 2 (forward
+      and reverse orientation) represents the total number of FASTQS (events)
+      to be included for a certain well.
+      */
+      | map {id, state ->
+          def group_key = groupKey(id, state.n_lanes * 2)
+          return [group_key, state]
+      }
+      | groupTuple(sort: {a, b ->
+        // Make sure that the grouped states are in order,
+        // meaning forward and reverse FASTQs are paired and the FASTQ
+        // for the forward reads comes before the reverse reads FASTQ.
+        if (a.lane_sorting == b.lane_sorting) {
+          return a.pair_end <=> b.pair_end
+        }
+        return a.lane_sorting <=> b.lane_sorting
+      })
+      | map {_, states ->
+        // The states are in one long flat list, group them into pairs
+        // This assumes that the FASTQ files are already in order!
+        // (See the 'sort' argument of groupTuple above)
+        def output_pairs = states.collate(2)
+
+        // Sanity check the state
+        output_pairs.each{ pair ->
+          assert pair.size() == 2, \
+            "State error: expected FASTQ pairs as output from cutadapt, " +
+            "found output state: $pair"
+          def (first, second) = pair
+          def should_be_the_same = [
+            "barcode",
+            "well_id",
+            "lane",
+            "pool",
+            "lane_sorting",
+          ]
+          should_be_the_same.each { attr_to_check ->
+            // Declared with 'def' to keep these local to the closure: variables
+            // assigned without a declaration end up in the shared script scope.
+            def first_attr = first.get(attr_to_check)
+            def second_attr = second.get(attr_to_check)
+            assert first_attr == second_attr, \
+              "State error: expected FASTQ pairs from cutadapt to have " +
+              "the same detected ${attr_to_check}. Found: " +
+              "$first_attr and $second_attr"
+          }
+          // Forward and reverse reads should be designated
+          // by 'R1' and 'R2', and sorted lexicographically.
+          assert first.pair_end == "R1", \
+            "State error: expected first item from FASTQ pair to have " +
+             "orientation 'R1', found $first.pair_end"
+          assert second.pair_end == "R2", \
+            "State error: expected second item from FASTQ pair to have " +
+             "orientation 'R2', found $second.pair_end"
+        }
+
+        def r1_output = output_pairs.collect{it[0].output}
+        def r2_output = output_pairs.collect{it[1].output}
+        assert r1_output.size() == r2_output.size()
+
+        /* The lane sorting represents the order of the FASTQ files
+           as provided by the input. The order of the FASTQ files should
+           remain the same in the well output. This is because the result of STAR
+           can differ based on the order of the reads in the FASTQ file.
+           Even when the same reads are provided, the order of them matters.
+        */
+        // Collect the lane sorting of the first item of every pair (both items
+        // of a pair were checked above to have the same value). A plain
+        // property access on the list ('output_pairs.it[0]') would be spread
+        // over the elements and result in an empty list, making the check
+        // below pass unconditionally.
+        def lane_sorting = output_pairs.collect{it[0].lane_sorting}
+        def sorting_is_monotonically_increasing = lane_sorting.withIndex().every { i, idx ->
+          idx == 0 || lane_sorting[idx - 1] <= i
+        }
+        assert sorting_is_monotonically_increasing, \
+          "State error: expected the order of the FASTQ files after grouping " +
+          "the cutadapt output to be the same as the order in the input. " +
+          "Found sorting $lane_sorting, R1 output: $r1_output, R2 output: $r2_output."
+
+        // Here we pick the state from the first item in the list of states
+        // and overwrite the keys which are different across states
+        def first_state = states[0]
+        def new_id = first_state.pool + "__" + first_state.well_id
+        def new_state = first_state + ["output_r1": r1_output, "output_r2": r2_output]
+        [new_id, new_state]
+      }
+      // TODO: Expand this into matching a whitelist/blacklist of barcodes
+      // ... and turn into separate component
+      | filter{ id, state -> state.well_id != "unknown" }
+      | concat_text.run(
+        directives: [label: ["lowmem", "lowcpu"]],
+        key: "concat_txt_r1",
+        runIf: {id, state -> state.output_r1.size() > 1},
+        fromState: { id, state ->
+          [
+            input: state.output_r1,
+            gzip_output: false,
+            output: "${id}_R1.fastq"
+          ]
+        },
+        toState: { id, result, state ->
+          def newState = state + [ output_r1: [ result.output ] ]
+          return newState
+        }
+      )
+      | concat_text.run(
+        directives: [label: ["lowmem", "lowcpu"]],
+        key: "concat_text_r2",
+        runIf: {id, state -> state.output_r2.size() > 1},
+        fromState: { id, state ->
+          [
+            input: state.output_r2,
+            gzip_output: false,
+            output: "${id}_R2.fastq",
+          ]
+        },
+        toState: { id, result, state ->
+          def newState = state + [ output_r2: [ result.output ] ]
+          return newState
+        }
+      )
+      | setState(["pool", "well_id", "n_wells", "barcode", "lane", "_meta", "output_r1", "output_r2"])
+
+  emit:
+    output_ch
+}
--- a/src/workflows/well_demultiplex/nextflow.config
+++ b/src/workflows/well_demultiplex/nextflow.config
@@ -0,0 +1,11 @@
+// Nextflow version requirement for this workflow.
+manifest {
+  nextflowVersion = '!>=20.12.1-edge'
+}
+
+// 'rootDir' is the normalized absolute path three directory levels above
+// $projectDir; it is used below to locate the shared configuration.
+params {
+  rootDir = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString()
+}
+
+
+// include common settings
+includeConfig("${params.rootDir}/src/config/labels.config")
--- a/src/workflows/well_demultiplex/test.nf
+++ b/src/workflows/well_demultiplex/test.nf
@@ -0,0 +1,82 @@
+include { well_demultiplex } from params.rootDir + "/target/nextflow/workflows/well_demultiplex/main.nf"
+
+params.resources_test =  "gs://viash-hub-test-data/htrnaseq/v1/"
+
+workflow test_wf {
+  // Two samples, each with a single pair of FASTQ files, demultiplexed
+  // with the same barcodes FASTA.
+  resources_test_file = file(params.resources_test)
+  def barcodes_fasta = resources_test_file.resolve("2-wells-with-ids.fasta")
+  def samples = [
+    [
+      id: "SRR14730301",
+      input_r1: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R1_001.fastq"),
+      input_r2: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R2_001.fastq"),
+      barcodesFasta: barcodes_fasta,
+    ],
+    [
+      id: "SRR14730302",
+      input_r1: resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R1_001.fastq"),
+      input_r2: resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R2_001.fastq"),
+      barcodesFasta: barcodes_fasta,
+    ],
+  ]
+
+  output_ch = Channel.fromList(samples)
+    | map { sample -> [sample.id, sample] }
+    | well_demultiplex.run(
+      // Select the workflow arguments from the state
+      fromState: { sample_id, sample ->
+        sample.subMap(["input_r1", "input_r2", "barcodesFasta"])
+      },
+      // The workflow output becomes the new state
+      toState: { sample_id, result, sample -> result }
+    )
+    // Check the shape of every individual event
+    | view { output ->
+      assert output.size() == 2 : "outputs should contain two elements; [id, file]"
+      "Output: $output"
+    }
+    // Check the total number of events
+    | toSortedList()
+    | view { output ->
+      assert output.size() == 4 : "2 samples, each with 2 barcodes"
+    }
+}
+
+
+workflow test_wf2 {
+  // One sample for which two FASTQ files are provided per read orientation.
+  resources_test_file = file(params.resources_test)
+  def forward_reads = [
+    resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R1_001.fastq"),
+    resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R1_001.fastq"),
+  ]
+  def reverse_reads = [
+    resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R2_001.fastq"),
+    resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R2_001.fastq"),
+  ]
+  def samples = [
+    [
+      id: "SRR14730301",
+      input_r1: forward_reads,
+      input_r2: reverse_reads,
+      barcodesFasta: resources_test_file.resolve("2-wells-with-ids.fasta"),
+    ],
+  ]
+
+  output_ch = Channel.fromList(samples)
+    | map { sample -> [sample.id, sample] }
+    | well_demultiplex.run(
+      // Select the workflow arguments from the state
+      fromState: { sample_id, sample ->
+        sample.subMap(["input_r1", "input_r2", "barcodesFasta"])
+      },
+      // The workflow output becomes the new state
+      toState: { sample_id, result, sample -> result }
+    )
+    // Check the shape of every individual event
+    | view { output ->
+      assert output.size() == 2 : "outputs should contain two elements; [id, file]"
+      "Output: $output"
+    }
+    // Check the total number of events
+    | toSortedList()
+    | view { output ->
+      assert output.size() == 2 : "1 samples, and two barcodes"
+    }
+}
--- a/target/.build.yaml
+++ b/target/.build.yaml
--- a/target/dependencies/vsh/vsh/biobox/v0.2.0/nextflow/cutadapt/.config.vsh.yaml
+++ b/target/dependencies/vsh/vsh/biobox/v0.2.0/nextflow/cutadapt/.config.vsh.yaml
@@ -0,0 +1,766 @@
+name: "cutadapt"
+version: "v0.2.0"
+authors:
+- name: "Toni Verbeiren"
+  roles:
+  - "author"
+  - "maintainer"
+  info:
+    links:
+      github: "tverbeiren"
+      linkedin: "verbeiren"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist and CEO"
+argument_groups:
+- name: "Specify Adapters for R1"
+  arguments:
+  - type: "string"
+    name: "--adapter"
+    alternatives:
+    - "-a"
+    description: "Sequence of an adapter ligated to the 3' end (paired data:\nof the\
+      \ first read). The adapter and subsequent bases are\ntrimmed. If a '$' character\
+      \ is appended ('anchoring'), the\nadapter is only found if it is a suffix of\
+      \ the read.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "string"
+    name: "--front"
+    alternatives:
+    - "-g"
+    description: "Sequence of an adapter ligated to the 5' end (paired data:\nof the\
+      \ first read). The adapter and any preceding bases\nare trimmed. Partial matches\
+      \ at the 5' end are allowed. If\na '^' character is prepended ('anchoring'),\
+      \ the adapter is\nonly found if it is a prefix of the read.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "string"
+    name: "--anywhere"
+    alternatives:
+    - "-b"
+    description: "Sequence of an adapter that may be ligated to the 5' or 3'\nend\
+      \ (paired data: of the first read). Both types of\nmatches as described under\
+      \ -a and -g are allowed. If the\nfirst base of the read is part of the match,\
+      \ the behavior\nis as with -g, otherwise as with -a. This option is mostly\n\
+      for rescuing failed library preparations - do not use if\nyou know which end\
+      \ your adapter was ligated to!\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+- name: "Specify Adapters using Fasta files for R1"
+  arguments:
+  - type: "file"
+    name: "--adapter_fasta"
+    description: "Fasta file containing sequences of an adapter ligated to the 3'\
+      \ end (paired data:\nof the first read). The adapter and subsequent bases are\n\
+      trimmed. If a '$' character is appended ('anchoring'), the\nadapter is only\
+      \ found if it is a suffix of the read.\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "file"
+    name: "--front_fasta"
+    description: "Fasta file containing sequences of an adapter ligated to the 5'\
+      \ end (paired data:\nof the first read). The adapter and any preceding bases\n\
+      are trimmed. Partial matches at the 5' end are allowed. If\na '^' character\
+      \ is prepended ('anchoring'), the adapter is\nonly found if it is a prefix of\
+      \ the read.\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--anywhere_fasta"
+    description: "Fasta file containing sequences of an adapter that may be ligated\
+      \ to the 5' or 3'\nend (paired data: of the first read). Both types of\nmatches\
+      \ as described under -a and -g are allowed. If the\nfirst base of the read is\
+      \ part of the match, the behavior\nis as with -g, otherwise as with -a. This\
+      \ option is mostly\nfor rescuing failed library preparations - do not use if\n\
+      you know which end your adapter was ligated to!\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+# R2 counterparts of the R1 adapter options: these apply to the second read.
+- name: "Specify Adapters for R2"
+  arguments:
+  - type: "string"
+    name: "--adapter_r2"
+    alternatives:
+    - "-A"
+    description: "Sequence of an adapter ligated to the 3' end (paired data:\nof the\
+      \ second read). The adapter and subsequent bases are\ntrimmed. If a '$' character\
+      \ is appended ('anchoring'), the\nadapter is only found if it is a suffix of\
+      \ the read.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "string"
+    name: "--front_r2"
+    alternatives:
+    - "-G"
+    description: "Sequence of an adapter ligated to the 5' end (paired data:\nof the\
+      \ second read). The adapter and any preceding bases\nare trimmed. Partial matches\
+      \ at the 5' end are allowed. If\na '^' character is prepended ('anchoring'),\
+      \ the adapter is\nonly found if it is a prefix of the read.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "string"
+    name: "--anywhere_r2"
+    alternatives:
+    - "-B"
+    description: "Sequence of an adapter that may be ligated to the 5' or 3'\nend\
+      \ (paired data: of the second read). Both types of\nmatches as described under\
+      \ -A and -G are allowed. If the\nfirst base of the read is part of the match,\
+      \ the behavior\nis as with -G, otherwise as with -A. This option is mostly\n\
+      for rescuing failed library preparations - do not use if\nyou know which end\
+      \ your adapter was ligated to!\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+# R2 counterparts of the R1 fasta options: these apply to the second read.
+- name: "Specify Adapters using Fasta files for R2"
+  arguments:
+  - type: "file"
+    name: "--adapter_r2_fasta"
+    description: "Fasta file containing sequences of an adapter ligated to the 3'\
+      \ end (paired data:\nof the second read). The adapter and subsequent bases are\n\
+      trimmed. If a '$' character is appended ('anchoring'), the\nadapter is only\
+      \ found if it is a suffix of the read.\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--front_r2_fasta"
+    description: "Fasta file containing sequences of an adapter ligated to the 5'\
+      \ end (paired data:\nof the second read). The adapter and any preceding bases\n\
+      are trimmed. Partial matches at the 5' end are allowed. If\na '^' character\
+      \ is prepended ('anchoring'), the adapter is\nonly found if it is a prefix of\
+      \ the read.\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--anywhere_r2_fasta"
+    description: "Fasta file containing sequences of an adapter that may be ligated\
+      \ to the 5' or 3'\nend (paired data: of the second read). Both types of\nmatches\
+      \ as described under -A and -G are allowed. If the\nfirst base of the read is\
+      \ part of the match, the behavior\nis as with -G, otherwise as with -A. This\
+      \ option is mostly\nfor rescuing failed library preparations - do not use if\n\
+      you know which end your adapter was ligated to!\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Paired-end options"
+  arguments:
+  - type: "boolean_true"
+    name: "--pair_adapters"
+    description: "Treat adapters given with -a/-A etc. as pairs. Either both\nor none\
+      \ are removed from each read pair.\n"
+    info: null
+    direction: "input"
+  - type: "string"
+    name: "--pair_filter"
+    description: "Which of the reads in a paired-end read have to match the\nfiltering\
+      \ criterion in order for the pair to be filtered.\n"
+    info: null
+    required: false
+    choices:
+    - "any"
+    - "both"
+    - "first"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--interleaved"
+    description: "Read and/or write interleaved paired-end reads.\n"
+    info: null
+    direction: "input"
+- name: "Input parameters"
+  arguments:
+  - type: "file"
+    name: "--input"
+    description: "Input fastq file for single-end reads or R1 for paired-end reads.\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--input_r2"
+    description: "Input fastq file for R2 in the case of paired-end reads.\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--error_rate"
+    alternatives:
+    - "-E"
+    - "--errors"
+    description: "Maximum allowed error rate (if 0 <= E < 1), or absolute\nnumber\
+      \ of errors for full-length adapter match (if E is an\ninteger >= 1). Error\
+      \ rate = no. of errors divided by\nlength of matching region. Default: 0.1 (10%).\n"
+    info: null
+    example:
+    - 0.1
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_false"
+    name: "--no_indels"
+    description: "Allow only mismatches in alignments.\n"
+    info: null
+    direction: "input"
+  - type: "integer"
+    name: "--times"
+    alternatives:
+    - "-n"
+    description: "Remove up to COUNT adapters from each read. Default: 1.\n"
+    info: null
+    example:
+    - 1
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--overlap"
+    alternatives:
+    - "-O"
+    description: "Require MINLENGTH overlap between read and adapter for an\nadapter\
+      \ to be found. The default is 3.\n"
+    info: null
+    example:
+    - 3
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--match_read_wildcards"
+    description: "Interpret IUPAC wildcards in reads.\n"
+    info: null
+    direction: "input"
+  - type: "boolean_false"
+    name: "--no_match_adapter_wildcards"
+    description: "Do not interpret IUPAC wildcards in adapters.\n"
+    info: null
+    direction: "input"
+  - type: "string"
+    name: "--action"
+    description: "What to do if a match was found. trim: trim adapter and\nup- or\
+      \ downstream sequence; retain: trim, but retain\nadapter; mask: replace with\
+      \ 'N' characters; lowercase:\nconvert to lowercase; none: leave unchanged.\n\
+      The default is trim.\n"
+    info: null
+    example:
+    - "trim"
+    required: false
+    choices:
+    - "trim"
+    - "retain"
+    - "mask"
+    - "lowercase"
+    - "none"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--revcomp"
+    alternatives:
+    - "--rc"
+    description: "Check both the read and its reverse complement for adapter\nmatches.\
+      \ If match is on reverse-complemented version,\noutput that one.\n"
+    info: null
+    direction: "input"
+- name: "Demultiplexing options"
+  arguments:
+  - type: "string"
+    name: "--demultiplex_mode"
+    description: "Enable demultiplexing and set the mode for it.\nWith mode 'unique_dual',\
+      \ adapters from the first and second read are used,\nand the indexes from the\
+      \ reads are only used in pairs. This implies\n--pair_adapters.\nEnabling mode\
+      \ 'combinatorial_dual' allows all combinations of the sets of indexes\non R1\
+      \ and R2. It is necessary to write each read pair to an output\nfile depending\
+      \ on the adapters found on both R1 and R2.\nMode 'single', uses indexes or barcodes\
+      \ located at the 5'\nend of the R1 read (single). \n"
+    info: null
+    required: false
+    choices:
+    - "single"
+    - "unique_dual"
+    - "combinatorial_dual"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Read modifications"
+  arguments:
+  - type: "integer"
+    name: "--cut"
+    alternatives:
+    - "-u"
+    description: "Remove LEN bases from each read (or R1 if paired; use --cut_r2\n\
+      option for R2). If LEN is positive, remove bases from the\nbeginning. If LEN\
+      \ is negative, remove bases from the end.\nCan be used twice if LENs have different\
+      \ signs. Applied\n*before* adapter trimming.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--cut_r2"
+    description: "Remove LEN bases from each read (for R2). If LEN is positive, remove\
+      \ bases from the\nbeginning. If LEN is negative, remove bases from the end.\n\
+      Can be used twice if LENs have different signs. Applied\n*before* adapter trimming.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "string"
+    name: "--nextseq_trim"
+    description: "NextSeq-specific quality trimming (each read). Trims also\ndark\
+      \ cycles appearing as high-quality G bases.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--quality_cutoff"
+    alternatives:
+    - "-q"
+    description: "Trim low-quality bases from 5' and/or 3' ends of each read\nbefore\
+      \ adapter removal. Applied to both reads if data is\npaired. If one value is\
+      \ given, only the 3' end is trimmed.\nIf two comma-separated cutoffs are given,\
+      \ the 5' end is\ntrimmed with the first cutoff, the 3' end with the second.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--quality_cutoff_r2"
+    alternatives:
+    - "-Q"
+    description: "Quality-trimming cutoff for R2. Default: same as for R1\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--quality_base"
+    description: "Assume that quality values in FASTQ are encoded as\nascii(quality\
+      \ + N). This needs to be set to 64 for some\nold Illumina FASTQ files. The default\
+      \ is 33.\n"
+    info: null
+    example:
+    - 33
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--poly_a"
+    description: "Trim poly-A tails"
+    info: null
+    direction: "input"
+  - type: "integer"
+    name: "--length"
+    alternatives:
+    - "-l"
+    description: "Shorten reads to LENGTH. Positive values remove bases at\nthe end\
+      \ while negative ones remove bases at the beginning.\nThis and the following\
+      \ modifications are applied after\nadapter trimming.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--trim_n"
+    description: "Trim N's on ends of reads."
+    info: null
+    direction: "input"
+  - type: "string"
+    name: "--length_tag"
+    description: "Search for TAG followed by a decimal number in the\ndescription\
+      \ field of the read. Replace the decimal number\nwith the correct length of\
+      \ the trimmed read. For example,\nuse --length-tag 'length=' to correct fields\
+      \ like\n'length=123'.\n"
+    info: null
+    example:
+    - "length="
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--strip_suffix"
+    description: "Remove this suffix from read names if present. Can be\ngiven multiple\
+      \ times.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--prefix"
+    alternatives:
+    - "-x"
+    description: "Add this prefix to read names. Use {name} to insert the\nname of\
+      \ the matching adapter.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--suffix"
+    alternatives:
+    - "-y"
+    description: "Add this suffix to read names; can also include {name}\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--rename"
+    description: "Rename reads using TEMPLATE containing variables such as\n{id},\
+      \ {adapter_name} etc. (see documentation)\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--zero_cap"
+    alternatives:
+    - "-z"
+    description: "Change negative quality values to zero."
+    info: null
+    direction: "input"
+- name: "Filtering of processed reads"
+  description: "Filters are applied after above read modifications. Paired-end reads\
+    \ are\nalways discarded pairwise (see also --pair_filter).\n"
+  arguments:
+  - type: "string"
+    name: "--minimum_length"
+    alternatives:
+    - "-m"
+    description: "Discard reads shorter than LEN. Default is 0.\nWhen trimming paired-end\
+      \ reads, the minimum lengths for R1 and R2 can be specified separately by separating\
+      \ them with a colon (:).\nIf the colon syntax is not used, the same minimum\
+      \ length applies to both reads, as discussed above.\nAlso, one of the values\
+      \ can be omitted to impose no restrictions.\nFor example, with -m 17:, the length\
+      \ of R1 must be at least 17, but the length of R2 is ignored.\n"
+    info: null
+    example:
+    - "0"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--maximum_length"
+    alternatives:
+    - "-M"
+    description: "Discard reads longer than LEN. Default: no limit.\nFor paired reads,\
+      \ see the remark for --minimum_length\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--max_n"
+    description: "Discard reads with more than COUNT 'N' bases. If COUNT is\na number\
+      \ between 0 and 1, it is interpreted as a fraction\nof the read length.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  # Both thresholds are fractional values in cutadapt, hence type "double".
+  - type: "double"
+    name: "--max_expected_errors"
+    alternatives:
+    - "--max_ee"
+    description: "Discard reads whose expected number of errors (computed\nfrom quality\
+      \ values) exceeds ERRORS.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "double"
+    name: "--max_average_error_rate"
+    alternatives:
+    - "--max_aer"
+    description: "as --max_expected_errors (see above), but divided by\nlength to\
+      \ account for reads of varying length.\n"
+    info: null
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--discard_trimmed"
+    alternatives:
+    - "--discard"
+    description: "Discard reads that contain an adapter. Use also -O to\navoid discarding\
+      \ too many randomly matching reads.\n"
+    info: null
+    direction: "input"
+  - type: "boolean_true"
+    name: "--discard_untrimmed"
+    alternatives:
+    - "--trimmed_only"
+    description: "Discard reads that do not contain an adapter.\n"
+    info: null
+    direction: "input"
+  - type: "boolean_true"
+    name: "--discard_casava"
+    description: "Discard reads that did not pass CASAVA filtering (header\nhas :Y:).\n"
+    info: null
+    direction: "input"
+- name: "Output parameters"
+  arguments:
+  - type: "string"
+    name: "--report"
+    description: "Which type of report to print: 'full' (default) or 'minimal'.\n"
+    info: null
+    example:
+    - "full"
+    required: false
+    choices:
+    - "full"
+    - "minimal"
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--json"
+    description: "Write report in JSON format to this file.\n"
+    info: null
+    direction: "input"
+  - type: "file"
+    name: "--output"
+    description: "Glob pattern for matching the expected output files.\nShould include\
+      \ `$output_dir`.\n"
+    info: null
+    example:
+    - "fastq/*_001.fast[a,q]"
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "output"
+    multiple: true
+    multiple_sep: ";"
+  - type: "boolean_true"
+    name: "--fasta"
+    description: "Output FASTA to standard output even on FASTQ input.\n"
+    info: null
+    direction: "input"
+  - type: "boolean_true"
+    name: "--info_file"
+    description: "Write information about each read and its adapter matches\ninto\
+      \ info.txt in the output directory.\nSee the documentation for the file format.\n"
+    info: null
+    direction: "input"
+- name: "Debug"
+  arguments:
+  - type: "boolean_true"
+    name: "--debug"
+    description: "Print debug information"
+    info: null
+    direction: "input"
+resources:
+- type: "bash_script"
+  path: "script.sh"
+  is_executable: true
+description: "Cutadapt removes adapter sequences from high-throughput sequencing reads.\n"
+test_resources:
+- type: "bash_script"
+  path: "test.sh"
+  is_executable: true
+info: null
+status: "enabled"
+requirements:
+  commands:
+  - "ps"
+keywords:
+- "RNA-seq"
+- "scRNA-seq"
+- "high-throughput"
+license: "MIT"
+references:
+  doi:
+  - "10.14806/ej.17.1.200"
+links:
+  repository: "https://github.com/marcelm/cutadapt"
+  homepage: "https://cutadapt.readthedocs.io"
+  documentation: "https://cutadapt.readthedocs.io"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+  debug: false
+  container: "docker"
+engines:
+- type: "docker"
+  id: "docker"
+  image: "python:3.12"
+  target_registry: "images.viash-hub.com"
+  target_tag: "v0.2.0"
+  namespace_separator: "/"
+  setup:
+  - type: "python"
+    user: false
+    pip:
+    - "cutadapt"
+    upgrade: true
+  - type: "docker"
+    run:
+    - "cutadapt --version | sed 's/\\(.*\\)/cutadapt: \"\\1\"/' > /var/software_versions.txt\n"
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+# Build provenance. The remote URL must not contain credentials: the access
+# token that was embedded here has been removed and should be revoked.
+build_info:
+  config: "src/cutadapt/config.vsh.yaml"
+  runner: "nextflow"
+  engine: "docker|native"
+  output: "target/nextflow/cutadapt"
+  executable: "target/nextflow/cutadapt/main.nf"
+  viash_version: "0.9.0"
+  git_commit: "7e530218844c373048bc33de58f021b6460642e5"
+  git_remote: "https://github.com/viash-hub/biobox"
+package_config:
+  name: "biobox"
+  version: "v0.2.0"
+  description: "A collection of bioinformatics tools for working with sequence data.\n"
+  info: null
+  viash_version: "0.9.0"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".requirements.commands := ['ps']\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'v0.2.0'"
+  keywords:
+  - "bioinformatics"
+  - "modules"
+  - "sequencing"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/viash-hub/biobox"
+    issue_tracker: "https://github.com/viash-hub/biobox/issues"
--- a/target/dependencies/vsh/vsh/biobox/v0.2.0/nextflow/cutadapt/main.nf
+++ b/target/dependencies/vsh/vsh/biobox/v0.2.0/nextflow/cutadapt/main.nf
--- a/target/dependencies/vsh/vsh/biobox/v0.2.0/nextflow/cutadapt/nextflow.config
+++ b/target/dependencies/vsh/vsh/biobox/v0.2.0/nextflow/cutadapt/nextflow.config
@@ -0,0 +1,126 @@
+manifest {  // pipeline metadata for the cutadapt module
+  name = 'cutadapt'
+  mainScript = 'main.nf'
+  nextflowVersion = '!>=20.12.1-edge'  // the leading '!' makes the requirement strict: Nextflow stops if its version does not satisfy it
+  version = 'v0.2.0'
+  description = 'Cutadapt removes adapter sequences from high-throughput sequencing reads.\n'
+  author = 'Toni Verbeiren'
+}
+
+process.container = 'nextflow/bash:latest'  // default container for processes; NOTE(review): ':latest' is a floating tag
+
+// Resolve the temp directory: the first set, non-empty variable among NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR; otherwise '/tmp'
+tempDir = java.nio.file.Paths.get(
+  System.getenv('NXF_TEMP') ?:
+    System.getenv('VIASH_TEMP') ?: 
+    System.getenv('TEMPDIR') ?: 
+    System.getenv('TMPDIR') ?: 
+    '/tmp'
+).toAbsolutePath()  // converted to an absolute path; consumed by the mount_temp profile
+
+profiles {  // optional configuration sets; each container profile enables one engine and disables the other four
+  no_publish {  // turns publishDir off for every process (the selector '.*' matches all names)
+    process {
+      withName: '.*' {
+        publishDir = [
+          enabled: false
+        ]
+      }
+    }
+  }
+  mount_temp {  // points the docker/podman/charliecloud temp setting at the tempDir resolved above
+    docker.temp            = tempDir
+    podman.temp            = tempDir
+    charliecloud.temp      = tempDir
+  }
+  docker {  // run with Docker
+    docker.enabled         = true
+    // docker.userEmulation   = true
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+  singularity {  // run with Singularity, with automatic mounts enabled
+    singularity.enabled    = true
+    singularity.autoMounts = true
+    docker.enabled         = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+  podman {  // run with Podman
+    podman.enabled         = true
+    docker.enabled         = false
+    singularity.enabled    = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+  shifter {  // run with Shifter
+    shifter.enabled        = true
+    docker.enabled         = false
+    singularity.enabled    = false
+    podman.enabled         = false
+    charliecloud.enabled   = false
+  }
+  charliecloud {  // run with Charliecloud
+    charliecloud.enabled   = true
+    docker.enabled         = false
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+  }
+}
+
+process{  // resource presets: a process carrying one of these labels gets the matching memory / cpu request
+  withLabel: mem1gb { memory = 1000000000.B }  // mem<N>gb / mem<N>tb labels use decimal units (1 GB = 10^9 bytes)
+  withLabel: mem2gb { memory = 2000000000.B }
+  withLabel: mem5gb { memory = 5000000000.B }
+  withLabel: mem10gb { memory = 10000000000.B }
+  withLabel: mem20gb { memory = 20000000000.B }
+  withLabel: mem50gb { memory = 50000000000.B }
+  withLabel: mem100gb { memory = 100000000000.B }
+  withLabel: mem200gb { memory = 200000000000.B }
+  withLabel: mem500gb { memory = 500000000000.B }
+  withLabel: mem1tb { memory = 1000000000000.B }
+  withLabel: mem2tb { memory = 2000000000000.B }
+  withLabel: mem5tb { memory = 5000000000000.B }
+  withLabel: mem10tb { memory = 10000000000000.B }
+  withLabel: mem20tb { memory = 20000000000000.B }
+  withLabel: mem50tb { memory = 50000000000000.B }
+  withLabel: mem100tb { memory = 100000000000000.B }
+  withLabel: mem200tb { memory = 200000000000000.B }
+  withLabel: mem500tb { memory = 500000000000000.B }
+  withLabel: mem1gib { memory = 1073741824.B }  // mem<N>gib / mem<N>tib labels use binary units (1 GiB = 2^30 bytes)
+  withLabel: mem2gib { memory = 2147483648.B }
+  withLabel: mem4gib { memory = 4294967296.B }
+  withLabel: mem8gib { memory = 8589934592.B }
+  withLabel: mem16gib { memory = 17179869184.B }
+  withLabel: mem32gib { memory = 34359738368.B }
+  withLabel: mem64gib { memory = 68719476736.B }
+  withLabel: mem128gib { memory = 137438953472.B }
+  withLabel: mem256gib { memory = 274877906944.B }
+  withLabel: mem512gib { memory = 549755813888.B }
+  withLabel: mem1tib { memory = 1099511627776.B }
+  withLabel: mem2tib { memory = 2199023255552.B }
+  withLabel: mem4tib { memory = 4398046511104.B }
+  withLabel: mem8tib { memory = 8796093022208.B }
+  withLabel: mem16tib { memory = 17592186044416.B }
+  withLabel: mem32tib { memory = 35184372088832.B }
+  withLabel: mem64tib { memory = 70368744177664.B }
+  withLabel: mem128tib { memory = 140737488355328.B }
+  withLabel: mem256tib { memory = 281474976710656.B }
+  withLabel: mem512tib { memory = 562949953421312.B }
+  withLabel: cpu1 { cpus = 1 }  // cpu<N> labels request N CPUs
+  withLabel: cpu2 { cpus = 2 }
+  withLabel: cpu5 { cpus = 5 }
+  withLabel: cpu10 { cpus = 10 }
+  withLabel: cpu20 { cpus = 20 }
+  withLabel: cpu50 { cpus = 50 }
+  withLabel: cpu100 { cpus = 100 }
+  withLabel: cpu200 { cpus = 200 }
+  withLabel: cpu500 { cpus = 500 }
+  withLabel: cpu1000 { cpus = 1000 }
+}
+
+
--- a/target/dependencies/vsh/vsh/biobox/v0.2.0/nextflow/cutadapt/nextflow_schema.json
+++ b/target/dependencies/vsh/vsh/biobox/v0.2.0/nextflow/cutadapt/nextflow_schema.json
@@ -0,0 +1,775 @@
+{
+"$schema": "http://json-schema.org/draft-07/schema",
+"title": "cutadapt",
+"description": "Cutadapt removes adapter sequences from high-throughput sequencing reads.\n",
+"type": "object",
+"definitions": {
+
+    
+    
+    "specify adapters for r1" : {
+    "title": "Specify Adapters for R1",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "adapter": {
+                "type":
+                "string",
+                "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 3\u0027 end (paired data:\nof the first read)",
+                "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 3\u0027 end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a \u0027$\u0027 character is appended (\u0027anchoring\u0027), the\nadapter is only found if it is a suffix of the read.\n"
+            
+            }
+    
+
+        ,
+                "front": {
+                "type":
+                "string",
+                "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 5\u0027 end (paired data:\nof the first read)",
+                "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 5\u0027 end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5\u0027 end are allowed. If\na \u0027^\u0027 character is prepended (\u0027anchoring\u0027), the adapter is\nonly found if it is a prefix of the read.\n"
+            
+            }
+    
+
+        ,
+                "anywhere": {
+                "type":
+                "string",
+                "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read)",
+                "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n"
+            
+            }
+    
+
+}
+},
+    
+    
+    "specify adapters using fasta files for r1" : {
+    "title": "Specify Adapters using Fasta files for R1",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "adapter_fasta": {
+                "type":
+                "string",
+                "description": "Type: List of `file`, multiple_sep: `\";\"`. Fasta file containing sequences of an adapter ligated to the 3\u0027 end (paired data:\nof the first read)",
+                "help_text": "Type: List of `file`, multiple_sep: `\";\"`. Fasta file containing sequences of an adapter ligated to the 3\u0027 end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a \u0027$\u0027 character is appended (\u0027anchoring\u0027), the\nadapter is only found if it is a suffix of the read.\n"
+            
+            }
+    
+
+        ,
+                "front_fasta": {
+                "type":
+                "string",
+                "description": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 5\u0027 end (paired data:\nof the first read)",
+                "help_text": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 5\u0027 end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5\u0027 end are allowed. If\na \u0027^\u0027 character is prepended (\u0027anchoring\u0027), the adapter is\nonly found if it is a prefix of the read.\n"
+            
+            }
+    
+
+        ,
+                "anywhere_fasta": {
+                "type":
+                "string",
+                "description": "Type: `file`. Fasta file containing sequences of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read)",
+                "help_text": "Type: `file`. Fasta file containing sequences of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n"
+            
+            }
+    
+
+}
+},
+    
+    
+    "specify adapters for r2" : {
+    "title": "Specify Adapters for R2",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "adapter_r2": {
+                "type":
+                "string",
+                "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 3\u0027 end (paired data:\nof the second read)",
+                "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 3\u0027 end (paired data:\nof the second read). The adapter and subsequent bases are\ntrimmed. If a \u0027$\u0027 character is appended (\u0027anchoring\u0027), the\nadapter is only found if it is a suffix of the read.\n"
+            
+            }
+    
+
+        ,
+                "front_r2": {
+                "type":
+                "string",
+                "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 5\u0027 end (paired data:\nof the second read)",
+                "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 5\u0027 end (paired data:\nof the second read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5\u0027 end are allowed. If\na \u0027^\u0027 character is prepended (\u0027anchoring\u0027), the adapter is\nonly found if it is a prefix of the read.\n"
+            
+            }
+    
+
+        ,
+                "anywhere_r2": {
+                "type":
+                "string",
+                "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the second read)",
+                "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the second read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n"
+            
+            }
+    
+
+}
+},
+    
+    
+    "specify adapters using fasta files for r2" : {
+    "title": "Specify Adapters using Fasta files for R2",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "adapter_r2_fasta": {
+                "type":
+                "string",
+                "description": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 3\u0027 end (paired data:\nof the second read)",
+                "help_text": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 3\u0027 end (paired data:\nof the second read). The adapter and subsequent bases are\ntrimmed. If a \u0027$\u0027 character is appended (\u0027anchoring\u0027), the\nadapter is only found if it is a suffix of the read.\n"
+            
+            }
+    
+
+        ,
+                "front_r2_fasta": {
+                "type":
+                "string",
+                "description": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 5\u0027 end (paired data:\nof the second read)",
+                "help_text": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 5\u0027 end (paired data:\nof the second read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5\u0027 end are allowed. If\na \u0027^\u0027 character is prepended (\u0027anchoring\u0027), the adapter is\nonly found if it is a prefix of the read.\n"
+            
+            }
+    
+
+        ,
+                "anywhere_r2_fasta": {
+                "type":
+                "string",
+                "description": "Type: `file`. Fasta file containing sequences of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the second read)",
+                "help_text": "Type: `file`. Fasta file containing sequences of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the second read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n"
+            
+            }
+    
+
+}
+},
+    
+    
+    "paired-end options" : {
+    "title": "Paired-end options",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "pair_adapters": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Treat adapters given with -a/-A etc",
+                "help_text": "Type: `boolean_true`, default: `false`. Treat adapters given with -a/-A etc. as pairs. Either both\nor none are removed from each read pair.\n"
+            ,
+                "default": "False"
+            }
+    
+
+        ,
+                "pair_filter": {
+                "type":
+                "string",
+                "description": "Type: `string`, choices: ``any`, `both`, `first``. Which of the reads in a paired-end read have to match the\nfiltering criterion in order for the pair to be filtered",
+                "help_text": "Type: `string`, choices: ``any`, `both`, `first``. Which of the reads in a paired-end read have to match the\nfiltering criterion in order for the pair to be filtered.\n",
+                "enum": ["any", "both", "first"]
+            
+            
+            }
+    
+
+        ,
+                "interleaved": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Read and/or write interleaved paired-end reads",
+                "help_text": "Type: `boolean_true`, default: `false`. Read and/or write interleaved paired-end reads.\n"
+            ,
+                "default": "False"
+            }
+    
+
+}
+},
+    
+    
+    "input parameters" : {
+    "title": "Input parameters",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "input": {
+                "type":
+                "string",
+                "description": "Type: `file`, required. Input fastq file for single-end reads or R1 for paired-end reads",
+                "help_text": "Type: `file`, required. Input fastq file for single-end reads or R1 for paired-end reads.\n"
+            
+            }
+    
+
+        ,
+                "input_r2": {
+                "type":
+                "string",
+                "description": "Type: `file`. Input fastq file for R2 in the case of paired-end reads",
+                "help_text": "Type: `file`. Input fastq file for R2 in the case of paired-end reads.\n"
+            
+            }
+    
+
+        ,
+                "error_rate": {
+                "type":
+                "number",
+                "description": "Type: `double`, example: `0.1`. Maximum allowed error rate (if 0 \u003c= E \u003c 1), or absolute\nnumber of errors for full-length adapter match (if E is an\ninteger \u003e= 1)",
+                "help_text": "Type: `double`, example: `0.1`. Maximum allowed error rate (if 0 \u003c= E \u003c 1), or absolute\nnumber of errors for full-length adapter match (if E is an\ninteger \u003e= 1). Error rate = no. of errors divided by\nlength of matching region. Default: 0.1 (10%).\n"
+            
+            }
+    
+
+        ,
+                "no_indels": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_false`, default: `true`. Allow only mismatches in alignments",
+                "help_text": "Type: `boolean_false`, default: `true`. Allow only mismatches in alignments.\n"
+            ,
+                "default": "True"
+            }
+    
+
+        ,
+                "times": {
+                "type":
+                "integer",
+                "description": "Type: `integer`, example: `1`. Remove up to COUNT adapters from each read",
+                "help_text": "Type: `integer`, example: `1`. Remove up to COUNT adapters from each read. Default: 1.\n"
+            
+            }
+    
+
+        ,
+                "overlap": {
+                "type":
+                "integer",
+                "description": "Type: `integer`, example: `3`. Require MINLENGTH overlap between read and adapter for an\nadapter to be found",
+                "help_text": "Type: `integer`, example: `3`. Require MINLENGTH overlap between read and adapter for an\nadapter to be found. The default is 3.\n"
+            
+            }
+    
+
+        ,
+                "match_read_wildcards": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Interpret IUPAC wildcards in reads",
+                "help_text": "Type: `boolean_true`, default: `false`. Interpret IUPAC wildcards in reads.\n"
+            ,
+                "default": "False"
+            }
+    
+
+        ,
+                "no_match_adapter_wildcards": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_false`, default: `true`. Do not interpret IUPAC wildcards in adapters",
+                "help_text": "Type: `boolean_false`, default: `true`. Do not interpret IUPAC wildcards in adapters.\n"
+            ,
+                "default": "True"
+            }
+    
+
+        ,
+                "action": {
+                "type":
+                "string",
+                "description": "Type: `string`, example: `trim`, choices: ``trim`, `retain`, `mask`, `lowercase`, `none``. What to do if a match was found",
+                "help_text": "Type: `string`, example: `trim`, choices: ``trim`, `retain`, `mask`, `lowercase`, `none``. What to do if a match was found. trim: trim adapter and\nup- or downstream sequence; retain: trim, but retain\nadapter; mask: replace with \u0027N\u0027 characters; lowercase:\nconvert to lowercase; none: leave unchanged.\nThe default is trim.\n",
+                "enum": ["trim", "retain", "mask", "lowercase", "none"]
+            
+            
+            }
+    
+
+        ,
+                "revcomp": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Check both the read and its reverse complement for adapter\nmatches",
+                "help_text": "Type: `boolean_true`, default: `false`. Check both the read and its reverse complement for adapter\nmatches. If match is on reverse-complemented version,\noutput that one.\n"
+            ,
+                "default": "False"
+            }
+    
+
+}
+},
+    
+    
+    "demultiplexing options" : {
+    "title": "Demultiplexing options",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "demultiplex_mode": {
+                "type":
+                "string",
+                "description": "Type: `string`, choices: ``single`, `unique_dual`, `combinatorial_dual``. Enable demultiplexing and set the mode for it",
+                "help_text": "Type: `string`, choices: ``single`, `unique_dual`, `combinatorial_dual``. Enable demultiplexing and set the mode for it.\nWith mode \u0027unique_dual\u0027, adapters from the first and second read are used,\nand the indexes from the reads are only used in pairs. This implies\n--pair_adapters.\nEnabling mode \u0027combinatorial_dual\u0027 allows all combinations of the sets of indexes\non R1 and R2. It is necessary to write each read pair to an output\nfile depending on the adapters found on both R1 and R2.\nMode \u0027single\u0027, uses indexes or barcodes located at the 5\u0027\nend of the R1 read (single). \n",
+                "enum": ["single", "unique_dual", "combinatorial_dual"]
+            
+            
+            }
+    
+
+}
+},
+    
+    
+    "read modifications" : {
+    "title": "Read modifications",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "cut": {
+                "type":
+                "string",
+                "description": "Type: List of `integer`, multiple_sep: `\";\"`. Remove LEN bases from each read (or R1 if paired; use --cut_r2\noption for R2)",
+                "help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Remove LEN bases from each read (or R1 if paired; use --cut_r2\noption for R2). If LEN is positive, remove bases from the\nbeginning. If LEN is negative, remove bases from the end.\nCan be used twice if LENs have different signs. Applied\n*before* adapter trimming.\n"
+            
+            }
+    
+
+        ,
+                "cut_r2": {
+                "type":
+                "string",
+                "description": "Type: List of `integer`, multiple_sep: `\";\"`. Remove LEN bases from each read (for R2)",
+                "help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Remove LEN bases from each read (for R2). If LEN is positive, remove bases from the\nbeginning. If LEN is negative, remove bases from the end.\nCan be used twice if LENs have different signs. Applied\n*before* adapter trimming.\n"
+            
+            }
+    
+
+        ,
+                "nextseq_trim": {
+                "type":
+                "string",
+                "description": "Type: `string`. NextSeq-specific quality trimming (each read)",
+                "help_text": "Type: `string`. NextSeq-specific quality trimming (each read). Trims also\ndark cycles appearing as high-quality G bases.\n"
+            
+            }
+    
+
+        ,
+                "quality_cutoff": {
+                "type":
+                "string",
+                "description": "Type: `string`. Trim low-quality bases from 5\u0027 and/or 3\u0027 ends of each read\nbefore adapter removal",
+                "help_text": "Type: `string`. Trim low-quality bases from 5\u0027 and/or 3\u0027 ends of each read\nbefore adapter removal. Applied to both reads if data is\npaired. If one value is given, only the 3\u0027 end is trimmed.\nIf two comma-separated cutoffs are given, the 5\u0027 end is\ntrimmed with the first cutoff, the 3\u0027 end with the second.\n"
+            
+            }
+    
+
+        ,
+                "quality_cutoff_r2": {
+                "type":
+                "string",
+                "description": "Type: `string`. Quality-trimming cutoff for R2",
+                "help_text": "Type: `string`. Quality-trimming cutoff for R2. Default: same as for R1\n"
+            
+            }
+    
+
+        ,
+                "quality_base": {
+                "type":
+                "integer",
+                "description": "Type: `integer`, example: `33`. Assume that quality values in FASTQ are encoded as\nascii(quality + N)",
+                "help_text": "Type: `integer`, example: `33`. Assume that quality values in FASTQ are encoded as\nascii(quality + N). This needs to be set to 64 for some\nold Illumina FASTQ files. The default is 33.\n"
+            
+            }
+    
+
+        ,
+                "poly_a": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Trim poly-A tails",
+                "help_text": "Type: `boolean_true`, default: `false`. Trim poly-A tails"
+            ,
+                "default": "False"
+            }
+    
+
+        ,
+                "length": {
+                "type":
+                "integer",
+                "description": "Type: `integer`. Shorten reads to LENGTH",
+                "help_text": "Type: `integer`. Shorten reads to LENGTH. Positive values remove bases at\nthe end while negative ones remove bases at the beginning.\nThis and the following modifications are applied after\nadapter trimming.\n"
+            
+            }
+    
+
+        ,
+                "trim_n": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Trim N\u0027s on ends of reads",
+                "help_text": "Type: `boolean_true`, default: `false`. Trim N\u0027s on ends of reads."
+            ,
+                "default": "False"
+            }
+    
+
+        ,
+                "length_tag": {
+                "type":
+                "string",
+                "description": "Type: `string`, example: `length=`. Search for TAG followed by a decimal number in the\ndescription field of the read",
+                "help_text": "Type: `string`, example: `length=`. Search for TAG followed by a decimal number in the\ndescription field of the read. Replace the decimal number\nwith the correct length of the trimmed read. For example,\nuse --length-tag \u0027length=\u0027 to correct fields like\n\u0027length=123\u0027.\n"
+            
+            }
+    
+
+        ,
+                "strip_suffix": {
+                "type":
+                "string",
+                "description": "Type: `string`. Remove this suffix from read names if present",
+                "help_text": "Type: `string`. Remove this suffix from read names if present. Can be\ngiven multiple times.\n"
+            
+            }
+    
+
+        ,
+                "prefix": {
+                "type":
+                "string",
+                "description": "Type: `string`. Add this prefix to read names",
+                "help_text": "Type: `string`. Add this prefix to read names. Use {name} to insert the\nname of the matching adapter.\n"
+            
+            }
+    
+
+        ,
+                "suffix": {
+                "type":
+                "string",
+                "description": "Type: `string`. Add this suffix to read names; can also include {name}\n",
+                "help_text": "Type: `string`. Add this suffix to read names; can also include {name}\n"
+            
+            }
+    
+
+        ,
+                "rename": {
+                "type":
+                "string",
+                "description": "Type: `string`. Rename reads using TEMPLATE containing variables such as\n{id}, {adapter_name} etc",
+                "help_text": "Type: `string`. Rename reads using TEMPLATE containing variables such as\n{id}, {adapter_name} etc. (see documentation)\n"
+            
+            }
+    
+
+        ,
+                "zero_cap": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Change negative quality values to zero",
+                "help_text": "Type: `boolean_true`, default: `false`. Change negative quality values to zero."
+            ,
+                "default": "False"
+            }
+    
+
+}
+},
+    
+    
+    "filtering of processed reads" : {
+    "title": "Filtering of processed reads",
+    "type": "object",
+    "description": "Filters are applied after above read modifications. Paired-end reads are\nalways discarded pairwise (see also --pair_filter).\n",
+    "properties": {
+    
+        
+                "minimum_length": {
+                "type":
+                "string",
+                "description": "Type: `string`, example: `0`. Discard reads shorter than LEN",
+                "help_text": "Type: `string`, example: `0`. Discard reads shorter than LEN. Default is 0.\nWhen trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:).\nIf the colon syntax is not used, the same minimum length applies to both reads, as discussed above.\nAlso, one of the values can be omitted to impose no restrictions.\nFor example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored.\n"
+            
+            }
+    
+
+        ,
+                "maximum_length": {
+                "type":
+                "string",
+                "description": "Type: `string`. Discard reads longer than LEN",
+                "help_text": "Type: `string`. Discard reads longer than LEN. Default: no limit.\nFor paired reads, see the remark for --minimum_length\n"
+            
+            }
+    
+
+        ,
+                "max_n": {
+                "type":
+                "string",
+                "description": "Type: `string`. Discard reads with more than COUNT \u0027N\u0027 bases",
+                "help_text": "Type: `string`. Discard reads with more than COUNT \u0027N\u0027 bases. If COUNT is\na number between 0 and 1, it is interpreted as a fraction\nof the read length.\n"
+            
+            }
+    
+
+        ,
+                "max_expected_errors": {
+                "type":
+                "string",
+                "description": "Type: `long`. Discard reads whose expected number of errors (computed\nfrom quality values) exceeds ERRORS",
+                "help_text": "Type: `long`. Discard reads whose expected number of errors (computed\nfrom quality values) exceeds ERRORS.\n"
+            
+            }
+    
+
+        ,
+                "max_average_error_rate": {
+                "type":
+                "string",
+                "description": "Type: `long`. as --max_expected_errors (see above), but divided by\nlength to account for reads of varying length",
+                "help_text": "Type: `long`. as --max_expected_errors (see above), but divided by\nlength to account for reads of varying length.\n"
+            
+            }
+    
+
+        ,
+                "discard_trimmed": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Discard reads that contain an adapter",
+                "help_text": "Type: `boolean_true`, default: `false`. Discard reads that contain an adapter. Use also -O to\navoid discarding too many randomly matching reads.\n"
+            ,
+                "default": "False"
+            }
+    
+
+        ,
+                "discard_untrimmed": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Discard reads that do not contain an adapter",
+                "help_text": "Type: `boolean_true`, default: `false`. Discard reads that do not contain an adapter.\n"
+            ,
+                "default": false
+            }
+    
+
+        ,
+                "discard_casava": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Discard reads that did not pass CASAVA filtering (header\nhas :Y:)",
+                "help_text": "Type: `boolean_true`, default: `false`. Discard reads that did not pass CASAVA filtering (header\nhas :Y:).\n"
+            ,
+                "default": false
+            }
+    
+
+}
+},
+    
+    
+    "output parameters" : {
+    "title": "Output parameters",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "report": {
+                "type":
+                "string",
+                "description": "Type: `string`, example: `full`, choices: ``full`, `minimal``. Which type of report to print: \u0027full\u0027 (default) or \u0027minimal\u0027",
+                "help_text": "Type: `string`, example: `full`, choices: ``full`, `minimal``. Which type of report to print: \u0027full\u0027 (default) or \u0027minimal\u0027.\n",
+                "enum": ["full", "minimal"]
+            
+            
+            }
+    
+
+        ,
+                "json": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Write report in JSON format to this file",
+                "help_text": "Type: `boolean_true`, default: `false`. Write report in JSON format to this file.\n"
+            ,
+                "default": false
+            }
+    
+
+        ,
+                "output": {
+                "type":
+                "string",
+                "description": "Type: List of `file`, required, default: `$id.$key.output_*.fast[a,q]`, example: `fastq/*_001.fast[a,q]`, multiple_sep: `\";\"`. Glob pattern for matching the expected output files",
+                "help_text": "Type: List of `file`, required, default: `$id.$key.output_*.fast[a,q]`, example: `fastq/*_001.fast[a,q]`, multiple_sep: `\";\"`. Glob pattern for matching the expected output files.\nShould include `$output_dir`.\n"
+            ,
+                "default": "$id.$key.output_*.fast[a,q]"
+            }
+    
+
+        ,
+                "fasta": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Output FASTA to standard output even on FASTQ input",
+                "help_text": "Type: `boolean_true`, default: `false`. Output FASTA to standard output even on FASTQ input.\n"
+            ,
+                "default": false
+            }
+    
+
+        ,
+                "info_file": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Write information about each read and its adapter matches\ninto info",
+                "help_text": "Type: `boolean_true`, default: `false`. Write information about each read and its adapter matches\ninto info.txt in the output directory.\nSee the documentation for the file format.\n"
+            ,
+                "default": false
+            }
+    
+
+}
+},
+    
+    
+    "debug" : {
+    "title": "Debug",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "debug": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Print debug information",
+                "help_text": "Type: `boolean_true`, default: `false`. Print debug information"
+            ,
+                "default": false
+            }
+    
+
+}
+},
+    
+    
+    "nextflow input-output arguments" : {
+    "title": "Nextflow input-output arguments",
+    "type": "object",
+    "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
+    "properties": {
+    
+        
+                "publish_dir": {
+                "type":
+                "string",
+                "description": "Type: `string`, required, example: `output/`. Path to an output directory",
+                "help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
+            
+            }
+    
+
+        ,
+                "param_list": {
+                "type":
+                "string",
+                "description": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel",
+                "help_text": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob.\n\n* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ [\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027], [\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027] ]`.\n* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`.\n* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]`.\n* A yaml blob can also be passed directly as a string. Example: `--param_list \"[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]\"`.\n\nWhen passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.",
+                "hidden": true
+            
+            }
+    
+
+}
+}
+},
+"allOf": [
+
+    {
+    "$ref": "#/definitions/specify adapters for r1"
+    },
+
+    {
+    "$ref": "#/definitions/specify adapters using fasta files for r1"
+    },
+
+    {
+    "$ref": "#/definitions/specify adapters for r2"
+    },
+
+    {
+    "$ref": "#/definitions/specify adapters using fasta files for r2"
+    },
+
+    {
+    "$ref": "#/definitions/paired-end options"
+    },
+
+    {
+    "$ref": "#/definitions/input parameters"
+    },
+
+    {
+    "$ref": "#/definitions/demultiplexing options"
+    },
+
+    {
+    "$ref": "#/definitions/read modifications"
+    },
+
+    {
+    "$ref": "#/definitions/filtering of processed reads"
+    },
+
+    {
+    "$ref": "#/definitions/output parameters"
+    },
+
+    {
+    "$ref": "#/definitions/debug"
+    },
+
+    {
+    "$ref": "#/definitions/nextflow input-output arguments"
+    }
+]
+}
--- a/target/dependencies/vsh/vsh/craftbox/v0.1.0/nextflow/concat_text/.config.vsh.yaml
+++ b/target/dependencies/vsh/vsh/craftbox/v0.1.0/nextflow/concat_text/.config.vsh.yaml
@@ -0,0 +1,197 @@
+name: "concat_text"
+version: "v0.1.0"
+authors:
+- name: "Toni Verbeiren"
+  roles:
+  - "author"
+  - "maintainer"
+  info:
+    links:
+      github: "tverbeiren"
+      linkedin: "verbeiren"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist and CEO"
+- name: "Dries Schaumont"
+  roles:
+  - "reviewer"
+  info:
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+argument_groups:
+- name: "Input arguments"
+  arguments:
+  - type: "file"
+    name: "--input"
+    description: "A list of (gzipped) text files."
+    info: null
+    example:
+    - "input?.txt.gz"
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+- name: "Output arguments"
+  arguments:
+  - type: "boolean_true"
+    name: "--gzip_output"
+    description: "Should the output be zipped?"
+    info: null
+    direction: "input"
+  - type: "file"
+    name: "--output"
+    description: "File to write the output to, optionally gzipped."
+    info: null
+    example:
+    - "output.txt"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+resources:
+- type: "bash_script"
+  path: "script.sh"
+  is_executable: true
+description: "Concatenate a number of text files, handle gzipped text files gracefully\
+  \ and\noptionally gzip the output text file.\n\nThis component is useful for concatenating\
+  \ fastq files from different lanes, for instance.\n"
+test_resources:
+- type: "bash_script"
+  path: "test.sh"
+  is_executable: true
+info:
+  improvements: "This component could be improved in 2 ways:\n  1. Allow for a mix\
+    \ of zipped and plain input files\n  2. Allow to specify a compression algorithm\
+    \ for the output\n"
+status: "enabled"
+requirements:
+  commands:
+  - "ps"
+license: "MIT"
+links:
+  repository: "https://github.com/viash-hub/craftbox"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+  debug: false
+  container: "docker"
+engines:
+- type: "docker"
+  id: "docker"
+  image: "alpine:latest"
+  target_registry: "images.viash-hub.com"
+  target_tag: "v0.1.0"
+  namespace_separator: "/"
+  setup:
+  - type: "apk"
+    packages:
+    - "bash"
+    - "procps"
+    - "file"
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:
+  config: "src/concat_text/config.vsh.yaml"
+  runner: "nextflow"
+  engine: "docker|native"
+  output: "target/nextflow/concat_text"
+  executable: "target/nextflow/concat_text/main.nf"
+  viash_version: "0.9.0"
+  git_commit: "3143dd6e4c2c3107f79639fe8602d92f3141ad82"
+  git_remote: "https://github.com/viash-hub/craftbox"
+package_config:
+  name: "craftbox"
+  version: "v0.1.0"
+  description: "A collection of custom-tailored scripts and applied tools.\n"
+  info: null
+  viash_version: "0.9.0"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".requirements.commands := ['ps']\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'v0.1.0'"
+  keywords:
+  - "scripts"
+  - "custom"
+  - "implementations"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/viash-hub/craftbox"
+    issue_tracker: "https://github.com/viash-hub/craftbox/issues"
--- a/target/dependencies/vsh/vsh/craftbox/v0.1.0/nextflow/concat_text/main.nf
+++ b/target/dependencies/vsh/vsh/craftbox/v0.1.0/nextflow/concat_text/main.nf
--- a/target/dependencies/vsh/vsh/craftbox/v0.1.0/nextflow/concat_text/nextflow.config
+++ b/target/dependencies/vsh/vsh/craftbox/v0.1.0/nextflow/concat_text/nextflow.config
@@ -0,0 +1,126 @@
+manifest {
+  name = 'concat_text'
+  mainScript = 'main.nf'
+  nextflowVersion = '!>=20.12.1-edge'
+  version = 'v0.1.0'
+  description = 'Concatenate a number of text files, handle gzipped text files gracefully and\noptionally gzip the output text file.\n\nThis component is useful for concatenating fastq files from different lanes, for instance.\n'
+  author = 'Toni Verbeiren, Dries Schaumont'
+}
+
+process.container = 'nextflow/bash:latest'
+
+// detect tempdir: first set, non-empty value of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR (in that order); otherwise '/tmp'
+tempDir = java.nio.file.Paths.get(
+  System.getenv('NXF_TEMP') ?:
+    System.getenv('VIASH_TEMP') ?: 
+    System.getenv('TEMPDIR') ?: 
+    System.getenv('TMPDIR') ?: 
+    '/tmp'
+).toAbsolutePath()
+
+profiles {
+  no_publish {
+    process {
+      withName: '.*' {
+        publishDir = [
+          enabled: false
+        ]
+      }
+    }
+  }
+  mount_temp {
+    docker.temp            = tempDir
+    podman.temp            = tempDir
+    charliecloud.temp      = tempDir
+  }
+  docker {
+    docker.enabled         = true
+    // docker.userEmulation   = true
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+  singularity {
+    singularity.enabled    = true
+    singularity.autoMounts = true
+    docker.enabled         = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+  podman {
+    podman.enabled         = true
+    docker.enabled         = false
+    singularity.enabled    = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+  shifter {
+    shifter.enabled        = true
+    docker.enabled         = false
+    singularity.enabled    = false
+    podman.enabled         = false
+    charliecloud.enabled   = false
+  }
+  charliecloud {
+    charliecloud.enabled   = true
+    docker.enabled         = false
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+  }
+}
+
+process{
+  withLabel: mem1gb { memory = 1000000000.B }
+  withLabel: mem2gb { memory = 2000000000.B }
+  withLabel: mem5gb { memory = 5000000000.B }
+  withLabel: mem10gb { memory = 10000000000.B }
+  withLabel: mem20gb { memory = 20000000000.B }
+  withLabel: mem50gb { memory = 50000000000.B }
+  withLabel: mem100gb { memory = 100000000000.B }
+  withLabel: mem200gb { memory = 200000000000.B }
+  withLabel: mem500gb { memory = 500000000000.B }
+  withLabel: mem1tb { memory = 1000000000000.B }
+  withLabel: mem2tb { memory = 2000000000000.B }
+  withLabel: mem5tb { memory = 5000000000000.B }
+  withLabel: mem10tb { memory = 10000000000000.B }
+  withLabel: mem20tb { memory = 20000000000000.B }
+  withLabel: mem50tb { memory = 50000000000000.B }
+  withLabel: mem100tb { memory = 100000000000000.B }
+  withLabel: mem200tb { memory = 200000000000000.B }
+  withLabel: mem500tb { memory = 500000000000000.B }
+  withLabel: mem1gib { memory = 1073741824.B }
+  withLabel: mem2gib { memory = 2147483648.B }
+  withLabel: mem4gib { memory = 4294967296.B }
+  withLabel: mem8gib { memory = 8589934592.B }
+  withLabel: mem16gib { memory = 17179869184.B }
+  withLabel: mem32gib { memory = 34359738368.B }
+  withLabel: mem64gib { memory = 68719476736.B }
+  withLabel: mem128gib { memory = 137438953472.B }
+  withLabel: mem256gib { memory = 274877906944.B }
+  withLabel: mem512gib { memory = 549755813888.B }
+  withLabel: mem1tib { memory = 1099511627776.B }
+  withLabel: mem2tib { memory = 2199023255552.B }
+  withLabel: mem4tib { memory = 4398046511104.B }
+  withLabel: mem8tib { memory = 8796093022208.B }
+  withLabel: mem16tib { memory = 17592186044416.B }
+  withLabel: mem32tib { memory = 35184372088832.B }
+  withLabel: mem64tib { memory = 70368744177664.B }
+  withLabel: mem128tib { memory = 140737488355328.B }
+  withLabel: mem256tib { memory = 281474976710656.B }
+  withLabel: mem512tib { memory = 562949953421312.B }
+  withLabel: cpu1 { cpus = 1 }
+  withLabel: cpu2 { cpus = 2 }
+  withLabel: cpu5 { cpus = 5 }
+  withLabel: cpu10 { cpus = 10 }
+  withLabel: cpu20 { cpus = 20 }
+  withLabel: cpu50 { cpus = 50 }
+  withLabel: cpu100 { cpus = 100 }
+  withLabel: cpu200 { cpus = 200 }
+  withLabel: cpu500 { cpus = 500 }
+  withLabel: cpu1000 { cpus = 1000 }
+}
+
+
--- a/target/dependencies/vsh/vsh/craftbox/v0.1.0/nextflow/concat_text/nextflow_schema.json
+++ b/target/dependencies/vsh/vsh/craftbox/v0.1.0/nextflow/concat_text/nextflow_schema.json
@@ -0,0 +1,106 @@
+{
+"$schema": "http://json-schema.org/draft-07/schema",
+"title": "concat_text",
+"description": "Concatenate a number of text files, handle gzipped text files gracefully and\noptionally gzip the output text file.\n\nThis component is useful for concatenating fastq files from different lanes, for instance.\n",
+"type": "object",
+"definitions": {
+
+    
+    
+    "input arguments" : {
+    "title": "Input arguments",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "input": {
+                "type":
+                "string",
+                "description": "Type: List of `file`, required, example: `input?.txt.gz`, multiple_sep: `\";\"`. A list of (gzipped) text files",
+                "help_text": "Type: List of `file`, required, example: `input?.txt.gz`, multiple_sep: `\";\"`. A list of (gzipped) text files."
+            
+            }
+    
+
+}
+},
+    
+    
+    "output arguments" : {
+    "title": "Output arguments",
+    "type": "object",
+    "description": "No description",
+    "properties": {
+    
+        
+                "gzip_output": {
+                "type":
+                "boolean",
+                "description": "Type: `boolean_true`, default: `false`. Should the output be zipped?",
+                "help_text": "Type: `boolean_true`, default: `false`. Should the output be zipped?"
+            ,
+                "default": false
+            }
+    
+
+        ,
+                "output": {
+                "type":
+                "string",
+                "description": "Type: `file`, default: `$id.$key.output.txt`, example: `output.txt`. File to write the output to, optionally gzipped",
+                "help_text": "Type: `file`, default: `$id.$key.output.txt`, example: `output.txt`. File to write the output to, optionally gzipped."
+            ,
+                "default": "$id.$key.output.txt"
+            }
+    
+
+}
+},
+    
+    
+    "nextflow input-output arguments" : {
+    "title": "Nextflow input-output arguments",
+    "type": "object",
+    "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
+    "properties": {
+    
+        
+                "publish_dir": {
+                "type":
+                "string",
+                "description": "Type: `string`, required, example: `output/`. Path to an output directory",
+                "help_text": "Type: `string`, required, example: `output/`. Path to an output directory."
+            
+            }
+    
+
+        ,
+                "param_list": {
+                "type":
+                "string",
+                "description": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel",
+                "help_text": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob.\n\n* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ [\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027], [\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027] ]`.\n* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`.\n* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]`.\n* A yaml blob can also be passed directly as a string. Example: `--param_list \"[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]\"`.\n\nWhen passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.",
+                "hidden": true
+            
+            }
+    
+
+}
+}
+},
+"allOf": [
+
+    {
+    "$ref": "#/definitions/input arguments"
+    },
+
+    {
+    "$ref": "#/definitions/output arguments"
+    },
+
+    {
+    "$ref": "#/definitions/nextflow input-output arguments"
+    }
+]
+}
--- a/target/executable/eset/create_eset/.config.vsh.yaml
+++ b/target/executable/eset/create_eset/.config.vsh.yaml
@@ -0,0 +1,254 @@
+name: "create_eset"
+namespace: "eset"
+version: "fix_20"
+authors:
+- name: "Dries Schaumont"
+  roles:
+  - "maintainer"
+  info:
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+- name: "Marijke Van Moerbeke"
+  roles:
+  - "author"
+  info:
+    links:
+      github: "mvanmoerbeke"
+      orcid: "0000-0002-3097-5621"
+      linkedin: "marijke-van-moerbeke-84303a34"
+    organizations:
+    - name: "OpenAnalytics"
+      href: "https://www.openanalytics.eu"
+      role: "Statistical Consultant"
+argument_groups:
+- name: "Arguments"
+  arguments:
+  - type: "file"
+    name: "--pDataFile"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--fDataFile"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--mappingDir"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "string"
+    name: "--poolName"
+    info: null
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--output"
+    info: null
+    default:
+    - "eset.$id.rds"
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+resources:
+- type: "r_script"
+  path: "script.R"
+  is_executable: true
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+test_resources:
+- type: "r_script"
+  path: "test.R"
+  is_executable: true
+- type: "file"
+  path: "pData.tsv"
+- type: "file"
+  path: "fData.tsv"
+- type: "file"
+  path: "mapping_dir"
+info: null
+status: "enabled"
+requirements:
+  commands:
+  - "ps"
+license: "MIT"
+links:
+  repository: "https://github.com/viash-hub/htrnaseq"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+engines:
+- type: "docker"
+  id: "docker"
+  image: "r-base:4.3.0"
+  target_registry: "images.viash-hub.com"
+  target_tag: "fix_20"
+  namespace_separator: "/"
+  setup:
+  - type: "apt"
+    packages:
+    - "libcurl4-openssl-dev"
+    - "libssl-dev"
+    - "libxml2-dev"
+    - "libfftw3-dev"
+    - "libfontconfig1-dev"
+    - "libfreetype-dev"
+    - "libhdf5-dev"
+    - "bzip2"
+    - "libharfbuzz-dev"
+    - "libfribidi-dev"
+    - "libtiff-dev"
+    - "libgsl-dev"
+    - "libcairo-dev"
+    - "libudunits2-dev"
+    - "procps"
+    interactive: false
+  - type: "r"
+    cran:
+    - "nlcv"
+    bioc:
+    - "Biobase"
+    - "limma"
+    - "a4Core"
+    - "MLInterfaces"
+    - "multtest"
+    script:
+    - "remotes::install_url(\"https://cran.r-project.org/src/contrib/Archive/Matrix/Matrix_1.6-5.tar.gz\"\
+      , dependencies=TRUE, upgrade_dependencies=FALSE);\\\nremotes::install_url(\"\
+      https://cran.r-project.org/src/contrib/Archive/Seurat/Seurat_4.4.0.tar.gz\"\
+      , repos=BiocManager::repositories(), dependencies=TRUE, upgrade_dependencies=FALSE)\\\
+      \n"
+    bioc_force_install: false
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:
+  config: "src/eset/create_eset/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/eset/create_eset"
+  executable: "target/executable/eset/create_eset/create_eset"
+  viash_version: "0.9.0"
+  git_commit: "7ae6be67bd6159c27c4b50f0b2c1a56ec92b60c4"
+  git_remote: "https://github.com/viash-hub/htrnaseq"  # credential-free remote URL; access tokens must never be recorded in build metadata
+package_config:
+  name: "htrnaseq"
+  version: "fix_20"
+  description: "High-throughput pipeline [WIP]\n"
+  info:
+    test_resources:
+    - path: "gs://viash-hub-test-data/htrnaseq/v1/"
+      dest: "resources_test"
+  viash_version: "0.9.0"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\
+    \ := 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\
+    \ dest: 'nextflow_labels.config'}\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'fix_20'"
+  keywords:
+  - "bioinformatics"
+  - "sequence"
+  - "high-throughput"
+  - "mapping"
+  - "counting"
+  - "pipeline"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/viash-hub/htrnaseq"
+    issue_tracker: "https://github.com/viash-hub/htrnaseq/issues"
--- a/target/executable/eset/create_eset/create_eset
+++ b/target/executable/eset/create_eset/create_eset
--- a/target/executable/eset/create_eset/nextflow_labels.config
+++ b/target/executable/eset/create_eset/nextflow_labels.config
@@ -0,0 +1,108 @@
+executor {
+  $k8s {  // executor-specific scope: these settings apply only to the k8s executor
+    submitRateLimit = '10sec'  // upper bound on job submissions per time unit (format per Nextflow executor docs)
+    pollInterval = '1 sec'  // how often job status is polled
+  }
+}
+
+process {
+  container = 'nextflow/bash:latest'  // process-scope default container
+  
+  // default resources (the default memory is NOT routed through get_memory, so maxMemory does not cap it)
+  memory = { 8.Gb * task.attempt }  // closure: request grows with each attempt
+  cpus = 8
+  maxForks = 36
+
+  // Retry for exit codes that have something to do with memory issues (137..140 inclusive); anything else terminates
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+  maxRetries = 3
+  maxMemory = 192.GB  // read by get_memory() as the upper bound for the *mem labels below
+
+  // Resource labels
+  withLabel: verylowcpu { cpus = 2 }
+  withLabel: lowcpu { cpus = 8 }
+  withLabel: midcpu { cpus = 16 }
+  withLabel: highcpu { cpus = 32 }
+  // Memory labels: the request scales with task.attempt and is passed through get_memory()
+  withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
+  withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
+  withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
+  withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
+
+}
+
+profiles {
+  // detect tempdir: first set, non-empty value of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR (in that order); otherwise '/tmp'
+  tempDir = java.nio.file.Paths.get(
+    System.getenv('NXF_TEMP') ?:
+      System.getenv('VIASH_TEMP') ?: 
+      System.getenv('TEMPDIR') ?: 
+      System.getenv('TMPDIR') ?: 
+      '/tmp'
+  ).toAbsolutePath()  // NOTE(review): assigned inside the profiles scope rather than at top level -- confirm this is intended
+
+  mount_temp {  // point the container engines' temp setting at the detected tempDir
+    docker.temp            = tempDir
+    podman.temp            = tempDir
+    charliecloud.temp      = tempDir
+  }
+
+  no_publish {  // disable publishDir for every process
+    process {
+      withName: '.*' {  // pattern matches all process names
+        publishDir = [
+          enabled: false
+        ]
+      }
+    }
+  }
+
+  docker {  // enable Docker and explicitly disable the other container engines
+    docker.fixOwnership    = true
+    docker.enabled         = true
+    // docker.userEmulation   = true
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+
+  local {
+    // This config is for local processing: smaller cpu/memory labels and a 25.GB memory cap.
+    process {
+        withName: ".*parallel_map_process" {  // run matching processes one at a time
+          maxForks = 1
+        }
+        maxMemory = 25.GB  // overrides the 192.GB cap from the default process scope
+        withLabel: verylowcpu { cpus = 2 }
+        withLabel: lowcpu { cpus = 4 }
+        withLabel: midcpu { cpus = 6 }
+        withLabel: highcpu { cpus = 8 }
+  
+        withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
+        withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
+        withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
+    }
+  }
+}
+
+// Cap a requested memory amount (to_compare) at process.maxMemory; without a configured cap the request is returned as-is.
+// NOTE(review): reads `task.attempt` from inside a config-level function -- confirm `task` resolves in this scope.
+def get_memory(to_compare) {
+    if (!process.containsKey("maxMemory") || !process.maxMemory) {
+      return to_compare  // no cap configured
+    }
+    try {
+      // On the last allowed attempt, request the full cap regardless of the computed amount.
+      if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
+        return process.maxMemory
+      } else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
+        return process.maxMemory as nextflow.util.MemoryUnit  // request exceeds the cap: clamp to the cap
+      } else {
+        return to_compare
+      }
+    } catch (all) {
+          println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
+          System.exit(1)
+    }
+}
--- a/target/executable/eset/create_fdata/.config.vsh.yaml
+++ b/target/executable/eset/create_fdata/.config.vsh.yaml
@@ -0,0 +1,211 @@
+name: "create_fdata"
+namespace: "eset"
+version: "fix_20"
+authors:
+- name: "Dries Schaumont"
+  roles:
+  - "maintainer"
+  info:
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+- name: "Marijke Van Moerbeke"
+  roles:
+  - "contributor"
+  info:
+    links:
+      github: "mvanmoerbeke"
+      orcid: "0000-0002-3097-5621"
+      linkedin: "marijke-van-moerbeke-84303a34"
+    organizations:
+    - name: "OpenAnalytics"
+      href: "https://www.openanalytics.eu"
+      role: "Statistical Consultant"
+argument_groups:
+- name: "Arguments"
+  arguments:
+  - type: "file"
+    name: "--gtf"
+    description: "Genome annotation file in GTF format."
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--output"
+    description: "Tab-delimited text file containing information about the 'gene'\
+      \ or 'transcript'\nentries from the input GTF file. The 'transcript' entries\
+      \ are used in case the source\nof the GTF was 'refGene' or 'ncbiRefSeq'. \n"
+    info: null
+    default:
+    - "fData.$id.txt"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+resources:
+- type: "python_script"
+  path: "create_fdata.py"
+  is_executable: true
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+description: "Create a fdata file\n"
+test_resources:
+- type: "python_script"
+  path: "test.py"
+  is_executable: true
+- type: "file"
+  path: "test_annotation.gtf"
+info: null
+status: "enabled"
+requirements:
+  commands:
+  - "ps"
+license: "MIT"
+links:
+  repository: "https://github.com/viash-hub/htrnaseq"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+engines:  # Two engines are declared: "docker" and "native".
+- type: "docker"
+  id: "docker"
+  image: "python:3.12-slim"  # base image
+  target_registry: "images.viash-hub.com"
+  target_tag: "fix_20"
+  namespace_separator: "/"
+  setup:  # packages added on top of the base image
+  - type: "apt"
+    packages:
+    - "procps"  # NOTE(review): presumably supplies the `ps` command listed under requirements.commands; confirm
+    interactive: false
+  - type: "python"
+    user: false
+    packages:
+    - "pandas"
+    upgrade: true
+  test_setup:  # NOTE(review): presumably installed for test runs only; confirm against the viash docs
+  - type: "python"
+    user: false
+    packages:
+    - "viashpy"
+    upgrade: true
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:  # Provenance of this build: source config, runner/engine, output paths, viash version and git revision.
+  config: "src/eset/create_fdata/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/eset/create_fdata"
+  executable: "target/executable/eset/create_fdata/create_fdata"
+  viash_version: "0.9.0"
+  git_commit: "7ae6be67bd6159c27c4b50f0b2c1a56ec92b60c4"
+  git_remote: "https://github.com/viash-hub/htrnaseq"  # access token stripped from the URL; the leaked token must be revoked
+package_config:
+  name: "htrnaseq"
+  version: "fix_20"
+  description: "High-throughput pipeline [WIP]\n"
+  info:
+    test_resources:
+    - path: "gs://viash-hub-test-data/htrnaseq/v1/"
+      dest: "resources_test"
+  viash_version: "0.9.0"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\
+    \ := 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\
+    \ dest: 'nextflow_labels.config'}\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'fix_20'"
+  keywords:
+  - "bioinformatics"
+  - "sequence"
+  - "high-throughput"
+  - "mapping"
+  - "counting"
+  - "pipeline"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/viash-hub/htrnaseq"
+    issue_tracker: "https://github.com/viash-hub/htrnaseq/issues"
--- a/target/executable/eset/create_fdata/create_fdata
+++ b/target/executable/eset/create_fdata/create_fdata
--- a/target/executable/eset/create_fdata/nextflow_labels.config
+++ b/target/executable/eset/create_fdata/nextflow_labels.config
@@ -0,0 +1,108 @@
+executor {  // Executor settings; the `$k8s` sub-scope targets the k8s executor.
+  $k8s {
+    submitRateLimit = '10sec'
+    pollInterval = '1 sec'
+  }
+}
+
+process {  // Process defaults followed by per-label CPU and memory settings.
+  container = 'nextflow/bash:latest'
+  
+  // default resources
+  memory = { 8.Gb * task.attempt }  // base amount multiplied by the attempt number
+  cpus = 8
+  maxForks = 36
+
+  // Retry for exit codes that have something to do with memory issues
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }  // exit status 137-140 => 'retry', anything else => 'terminate'
+  maxRetries = 3
+  maxMemory = 192.GB  // limit read by get_memory() when sizing the *mem labels
+
+  // Resource labels
+  withLabel: verylowcpu { cpus = 2 }
+  withLabel: lowcpu { cpus = 8 }
+  withLabel: midcpu { cpus = 16 }
+  withLabel: highcpu { cpus = 32 }
+  
+  withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }  // each *mem label passes its scaled request through get_memory()
+  withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
+  withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
+  withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
+
+}
+
+profiles {  // Named profiles (mount_temp, no_publish, docker, local) plus the tempDir value used by mount_temp.
+  // detect tempdir: first non-empty of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR, else '/tmp', converted to an absolute path
+  tempDir = java.nio.file.Paths.get(
+    System.getenv('NXF_TEMP') ?:
+      System.getenv('VIASH_TEMP') ?: 
+      System.getenv('TEMPDIR') ?: 
+      System.getenv('TMPDIR') ?: 
+      '/tmp'
+  ).toAbsolutePath()
+
+  mount_temp {  // Profile: set the `temp` option of docker, podman and charliecloud to tempDir.
+    docker.temp            = tempDir
+    podman.temp            = tempDir
+    charliecloud.temp      = tempDir
+  }
+
+  no_publish {  // Profile: turn off publishDir for every process.
+    process {
+      withName: '.*' {  // the pattern '.*' matches any process name
+        publishDir = [
+          enabled: false
+        ]
+      }
+    }
+  }
+
+  docker {  // Profile: enable Docker and explicitly disable the other container engines.
+    docker.fixOwnership    = true
+    docker.enabled         = true
+    // docker.userEmulation   = true
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+
+  local {  // Profile: smaller CPU/memory settings per resource label.
+    // This config is for local processing.
+    process {
+        withName: ".*parallel_map_process" {  // processes whose name ends in "parallel_map_process"
+          maxForks = 1
+        }
+        maxMemory = 25.GB  // limit read by get_memory() when sizing the *mem labels
+        withLabel: verylowcpu { cpus = 2 }
+        withLabel: lowcpu { cpus = 4 }
+        withLabel: midcpu { cpus = 6 }
+        withLabel: highcpu { cpus = 8 }
+  
+        withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }  // base amount multiplied by the attempt number
+        withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
+        withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
+    }
+  }
+}
+
+// Clamp a requested memory amount to process.maxMemory; returns to_compare unchanged when no maxMemory is configured.
+def get_memory(to_compare) {
+    if (!process.containsKey("maxMemory") || !process.maxMemory) {
+      return to_compare
+    }
+    try {
+      // Attempt number equals maxRetries: return the configured maximum. NOTE(review): `task` is not a parameter here; confirm it resolves at call time.
+      if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
+        return process.maxMemory
+      }
+      // Request exceeds the limit: cap it. This branch previously returned the undefined name `max_memory`, which sent every over-limit request into the catch below.
+      if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
+        return process.maxMemory as nextflow.util.MemoryUnit
+      }
+      return to_compare
+    } catch (all) {
+      println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
+      System.exit(1)
+    }
+}
--- a/target/executable/eset/create_pdata/.config.vsh.yaml
+++ b/target/executable/eset/create_pdata/.config.vsh.yaml
@@ -0,0 +1,225 @@
+name: "create_pdata"
+namespace: "eset"
+version: "fix_20"
+authors:
+- name: "Dries Schaumont"
+  roles:
+  - "maintainer"
+  info:
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+- name: "Marijke Van Moerbeke"
+  roles:
+  - "contributor"
+  info:
+    links:
+      github: "mvanmoerbeke"
+      orcid: "0000-0002-3097-5621"
+      linkedin: "marijke-van-moerbeke-84303a34"
+    organizations:
+    - name: "OpenAnalytics"
+      href: "https://www.openanalytics.eu"
+      role: "Statistical Consultant"
+argument_groups:
+- name: "Arguments"
+  arguments:
+  - type: "file"
+    name: "--star_stats_file"
+    description: "Tab-delimited text file containing statistics (per column) that\
+      \ were generated\nfrom the STAR log files (Log.final.out, Summary.csv, ReadsPerGene.out.tab).\n\
+      Each entry (row) in the file describes the values for one well (barcode).\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--nrReadsNrGenesPerChromPool"
+    description: "Pivot table in tsv format of the combined nrReadsNrGenesPerChrom\
+      \ files from STAR. \nDescribes per chromosome (as columns) the number of reads,\
+      \ as well as the total number \nof reads per cell barcode and the percentage\
+      \ of nuclear, ERCC and mitochondrial\nreads.\n"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--output"
+    info: null
+    default:
+    - "pData.$id.txt"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+resources:
+- type: "python_script"
+  path: "create_pdata.py"
+  is_executable: true
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+description: "Create a pdata file by combining the mapping statistics \n"
+test_resources:
+- type: "python_script"
+  path: "test.py"
+  is_executable: true
+- type: "file"
+  path: "nrReadsNrGenesPerChromPool.txt"
+- type: "file"
+  path: "starLogs.txt"
+info: null
+status: "enabled"
+requirements:
+  commands:
+  - "ps"
+license: "MIT"
+links:
+  repository: "https://github.com/viash-hub/htrnaseq"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+engines:  # Two engines are declared: "docker" and "native".
+- type: "docker"
+  id: "docker"
+  image: "python:3.12-slim"  # base image
+  target_registry: "images.viash-hub.com"
+  target_tag: "fix_20"
+  namespace_separator: "/"
+  setup:  # packages added on top of the base image
+  - type: "apt"
+    packages:
+    - "procps"  # NOTE(review): presumably supplies the `ps` command listed under requirements.commands; confirm
+    interactive: false
+  - type: "python"
+    user: false
+    packages:
+    - "pandas"
+    upgrade: true
+  test_setup:  # NOTE(review): presumably installed for test runs only; confirm against the viash docs
+  - type: "python"
+    user: false
+    packages:
+    - "viashpy"
+    upgrade: true
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:  # Provenance of this build: source config, runner/engine, output paths, viash version and git revision.
+  config: "src/eset/create_pdata/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/eset/create_pdata"
+  executable: "target/executable/eset/create_pdata/create_pdata"
+  viash_version: "0.9.0"
+  git_commit: "7ae6be67bd6159c27c4b50f0b2c1a56ec92b60c4"
+  git_remote: "https://github.com/viash-hub/htrnaseq"  # access token stripped from the URL; the leaked token must be revoked
+package_config:
+  name: "htrnaseq"
+  version: "fix_20"
+  description: "High-throughput pipeline [WIP]\n"
+  info:
+    test_resources:
+    - path: "gs://viash-hub-test-data/htrnaseq/v1/"
+      dest: "resources_test"
+  viash_version: "0.9.0"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\
+    \ := 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\
+    \ dest: 'nextflow_labels.config'}\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'fix_20'"
+  keywords:
+  - "bioinformatics"
+  - "sequence"
+  - "high-throughput"
+  - "mapping"
+  - "counting"
+  - "pipeline"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/viash-hub/htrnaseq"
+    issue_tracker: "https://github.com/viash-hub/htrnaseq/issues"
--- a/target/executable/eset/create_pdata/create_pdata
+++ b/target/executable/eset/create_pdata/create_pdata
--- a/target/executable/eset/create_pdata/nextflow_labels.config
+++ b/target/executable/eset/create_pdata/nextflow_labels.config
@@ -0,0 +1,108 @@
+executor {  // Executor settings; the `$k8s` sub-scope targets the k8s executor.
+  $k8s {
+    submitRateLimit = '10sec'
+    pollInterval = '1 sec'
+  }
+}
+
+process {  // Process defaults followed by per-label CPU and memory settings.
+  container = 'nextflow/bash:latest'
+  
+  // default resources
+  memory = { 8.Gb * task.attempt }  // base amount multiplied by the attempt number
+  cpus = 8
+  maxForks = 36
+
+  // Retry for exit codes that have something to do with memory issues
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }  // exit status 137-140 => 'retry', anything else => 'terminate'
+  maxRetries = 3
+  maxMemory = 192.GB  // limit read by get_memory() when sizing the *mem labels
+
+  // Resource labels
+  withLabel: verylowcpu { cpus = 2 }
+  withLabel: lowcpu { cpus = 8 }
+  withLabel: midcpu { cpus = 16 }
+  withLabel: highcpu { cpus = 32 }
+  
+  withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }  // each *mem label passes its scaled request through get_memory()
+  withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
+  withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
+  withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
+
+}
+
+profiles {  // Named profiles (mount_temp, no_publish, docker, local) plus the tempDir value used by mount_temp.
+  // detect tempdir: first non-empty of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR, else '/tmp', converted to an absolute path
+  tempDir = java.nio.file.Paths.get(
+    System.getenv('NXF_TEMP') ?:
+      System.getenv('VIASH_TEMP') ?: 
+      System.getenv('TEMPDIR') ?: 
+      System.getenv('TMPDIR') ?: 
+      '/tmp'
+  ).toAbsolutePath()
+
+  mount_temp {  // Profile: set the `temp` option of docker, podman and charliecloud to tempDir.
+    docker.temp            = tempDir
+    podman.temp            = tempDir
+    charliecloud.temp      = tempDir
+  }
+
+  no_publish {  // Profile: turn off publishDir for every process.
+    process {
+      withName: '.*' {  // the pattern '.*' matches any process name
+        publishDir = [
+          enabled: false
+        ]
+      }
+    }
+  }
+
+  docker {  // Profile: enable Docker and explicitly disable the other container engines.
+    docker.fixOwnership    = true
+    docker.enabled         = true
+    // docker.userEmulation   = true
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+
+  local {  // Profile: smaller CPU/memory settings per resource label.
+    // This config is for local processing.
+    process {
+        withName: ".*parallel_map_process" {  // processes whose name ends in "parallel_map_process"
+          maxForks = 1
+        }
+        maxMemory = 25.GB  // limit read by get_memory() when sizing the *mem labels
+        withLabel: verylowcpu { cpus = 2 }
+        withLabel: lowcpu { cpus = 4 }
+        withLabel: midcpu { cpus = 6 }
+        withLabel: highcpu { cpus = 8 }
+  
+        withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }  // base amount multiplied by the attempt number
+        withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
+        withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
+    }
+  }
+}
+
+// Clamp a requested memory amount to process.maxMemory; returns to_compare unchanged when no maxMemory is configured.
+def get_memory(to_compare) {
+    if (!process.containsKey("maxMemory") || !process.maxMemory) {
+      return to_compare
+    }
+    try {
+      // Attempt number equals maxRetries: return the configured maximum. NOTE(review): `task` is not a parameter here; confirm it resolves at call time.
+      if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
+        return process.maxMemory
+      }
+      // Request exceeds the limit: cap it. This branch previously returned the undefined name `max_memory`, which sent every over-limit request into the catch below.
+      if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
+        return process.maxMemory as nextflow.util.MemoryUnit
+      }
+      return to_compare
+    } catch (all) {
+      println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
+      System.exit(1)
+    }
+}
--- a/target/executable/integration_test_components/htrnaseq/check_eset/.config.vsh.yaml
+++ b/target/executable/integration_test_components/htrnaseq/check_eset/.config.vsh.yaml
@@ -0,0 +1,182 @@
+name: "check_eset"
+namespace: "integration_test_components/htrnaseq"
+version: "fix_20"
+authors:
+- name: "Dries Schaumont"
+  roles:
+  - "author"
+  - "maintainer"
+  info:
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+argument_groups:
+- name: "Inputs"
+  arguments:
+  - type: "file"
+    name: "--eset"
+    description: "Path to an ExpressionSet object."
+    info: null
+    example:
+    - "eset.rds"
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "file"
+    name: "--star_output"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+resources:
+- type: "r_script"
+  path: "script.R"
+  is_executable: true
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+description: "This component tests the ExpressionSet object as output by the main pipeline."
+info: null
+status: "enabled"
+requirements:
+  commands:
+  - "ps"
+license: "MIT"
+links:
+  repository: "https://github.com/viash-hub/htrnaseq"
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+engines:  # Two engines are declared: "docker" and "native".
+- type: "docker"
+  id: "docker"
+  image: "bioconductor/bioconductor_docker:3.19"  # base image
+  target_registry: "images.viash-hub.com"
+  target_tag: "fix_20"
+  namespace_separator: "/"
+  setup:  # R packages added on top of the base image
+  - type: "r"
+    cran:
+    - "bit64"
+    bioc:
+    - "Biobase"
+    bioc_force_install: false
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+build_info:  # Provenance of this build: source config, runner/engine, output paths, viash version and git revision.
+  config: "src/integration_test_components/htrnaseq/check_eset/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/integration_test_components/htrnaseq/check_eset"
+  executable: "target/executable/integration_test_components/htrnaseq/check_eset/check_eset"
+  viash_version: "0.9.0"
+  git_commit: "7ae6be67bd6159c27c4b50f0b2c1a56ec92b60c4"
+  git_remote: "https://github.com/viash-hub/htrnaseq"  # access token stripped from the URL; the leaked token must be revoked
+package_config:
+  name: "htrnaseq"
+  version: "fix_20"
+  description: "High-throughput pipeline [WIP]\n"
+  info:
+    test_resources:
+    - path: "gs://viash-hub-test-data/htrnaseq/v1/"
+      dest: "resources_test"
+  viash_version: "0.9.0"
+  source: "src"
+  target: "target"
+  config_mods:
+  - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\
+    \ := 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\
+    \ dest: 'nextflow_labels.config'}\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'fix_20'"
+  keywords:
+  - "bioinformatics"
+  - "sequence"
+  - "high-throughput"
+  - "mapping"
+  - "counting"
+  - "pipeline"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/viash-hub/htrnaseq"
+    issue_tracker: "https://github.com/viash-hub/htrnaseq/issues"
--- a/target/executable/integration_test_components/htrnaseq/check_eset/check_eset
+++ b/target/executable/integration_test_components/htrnaseq/check_eset/check_eset
--- a/target/executable/integration_test_components/htrnaseq/check_eset/nextflow_labels.config
+++ b/target/executable/integration_test_components/htrnaseq/check_eset/nextflow_labels.config
@@ -0,0 +1,108 @@
+executor {  // Executor settings; the `$k8s` sub-scope targets the k8s executor.
+  $k8s {
+    submitRateLimit = '10sec'
+    pollInterval = '1 sec'
+  }
+}
+
+process {  // Process defaults followed by per-label CPU and memory settings.
+  container = 'nextflow/bash:latest'
+  
+  // default resources
+  memory = { 8.Gb * task.attempt }  // base amount multiplied by the attempt number
+  cpus = 8
+  maxForks = 36
+
+  // Retry for exit codes that have something to do with memory issues
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }  // exit status 137-140 => 'retry', anything else => 'terminate'
+  maxRetries = 3
+  maxMemory = 192.GB  // limit read by get_memory() when sizing the *mem labels
+
+  // Resource labels
+  withLabel: verylowcpu { cpus = 2 }
+  withLabel: lowcpu { cpus = 8 }
+  withLabel: midcpu { cpus = 16 }
+  withLabel: highcpu { cpus = 32 }
+  
+  withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }  // each *mem label passes its scaled request through get_memory()
+  withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
+  withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
+  withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
+
+}
+
+profiles {  // Named profiles (mount_temp, no_publish, docker, local) plus the tempDir value used by mount_temp.
+  // detect tempdir: first non-empty of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR, else '/tmp', converted to an absolute path
+  tempDir = java.nio.file.Paths.get(
+    System.getenv('NXF_TEMP') ?:
+      System.getenv('VIASH_TEMP') ?: 
+      System.getenv('TEMPDIR') ?: 
+      System.getenv('TMPDIR') ?: 
+      '/tmp'
+  ).toAbsolutePath()
+
+  mount_temp {  // Profile: set the `temp` option of docker, podman and charliecloud to tempDir.
+    docker.temp            = tempDir
+    podman.temp            = tempDir
+    charliecloud.temp      = tempDir
+  }
+
+  no_publish {  // Profile: turn off publishDir for every process.
+    process {
+      withName: '.*' {  // the pattern '.*' matches any process name
+        publishDir = [
+          enabled: false
+        ]
+      }
+    }
+  }
+
+  docker {  // Profile: enable Docker and explicitly disable the other container engines.
+    docker.fixOwnership    = true
+    docker.enabled         = true
+    // docker.userEmulation   = true
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+
+  local {  // Profile: smaller CPU/memory settings per resource label.
+    // This config is for local processing.
+    process {
+        withName: ".*parallel_map_process" {  // processes whose name ends in "parallel_map_process"
+          maxForks = 1
+        }
+        maxMemory = 25.GB  // limit read by get_memory() when sizing the *mem labels
+        withLabel: verylowcpu { cpus = 2 }
+        withLabel: lowcpu { cpus = 4 }
+        withLabel: midcpu { cpus = 6 }
+        withLabel: highcpu { cpus = 8 }
+  
+        withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }  // base amount multiplied by the attempt number
+        withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
+        withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
+    }
+  }
+}
+
+// Clamp a requested memory amount to process.maxMemory; returns to_compare unchanged when no maxMemory is configured.
+def get_memory(to_compare) {
+    if (!process.containsKey("maxMemory") || !process.maxMemory) {
+      return to_compare
+    }
+    try {
+      // Attempt number equals maxRetries: return the configured maximum. NOTE(review): `task` is not a parameter here; confirm it resolves at call time.
+      if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
+        return process.maxMemory
+      }
+      // Request exceeds the limit: cap it. This branch previously returned the undefined name `max_memory`, which sent every over-limit request into the catch below.
+      if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
+        return process.maxMemory as nextflow.util.MemoryUnit
+      }
+      return to_compare
+    } catch (all) {
+      println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
+      System.exit(1)
+    }
+}
--- a/target/executable/parallel_map/.config.vsh.yaml
+++ b/target/executable/parallel_map/.config.vsh.yaml
@@ -0,0 +1,298 @@
+name: "parallel_map"
+version: "fix_20"
+# People credited for this component, each with roles, contact links and
+# organisational affiliation.
+authors:
+- name: "Dries Schaumont"
+  roles:
+  - "maintainer"
+  info:
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+- name: "Toni Verbeiren"
+  roles:
+  - "author"
+  - "maintainer"
+  info:
+    role: "Core Team Member"
+    links:
+      github: "tverbeiren"
+      linkedin: "verbeiren"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist and CEO"
+# Command-line interface of the component: arguments organised into named groups.
+# Arguments with `multiple: true` take several values separated by `multiple_sep`.
+argument_groups:
+- name: "Input arguments"
+  arguments:
+  # --input_r1 / --input_r2 carry no description; presumably the read-1 / read-2
+  # files of the same samples. TODO confirm against script.sh.
+  - type: "file"
+    name: "--input_r1"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "file"
+    name: "--input_r2"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "file"
+    name: "--genomeDir"
+    description: "STAR reference directory"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "string"
+    name: "--barcodes"
+    description: "The barcodes/wells to process"
+    info: null
+    required: true
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+- name: "Barcode arguments"
+  arguments:
+  - type: "integer"
+    name: "--wellBarcodesLength"
+    description: "The length of the well barcodes"
+    info: null
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  - type: "integer"
+    name: "--umiLength"
+    description: "The length of the UMIs"
+    info: null
+    required: true
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+  # No description is given. The value is typed as a string (default
+  # "10000000000"); the name matches STAR's --limitBAMsortRAM option, so it is
+  # presumably forwarded to STAR. TODO confirm against script.sh.
+  - type: "string"
+    name: "--limitBAMsortRAM"
+    info: null
+    default:
+    - "10000000000"
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Runtime arguments"
+  arguments:
+  - type: "integer"
+    name: "--runThreadN"
+    description: "Number of threads to use for a single STAR execution."
+    info: null
+    default:
+    - 1
+    required: false
+    direction: "input"
+    multiple: false
+    multiple_sep: ";"
+- name: "Output arguments"
+  arguments:
+  # The default "./*" already contains the '*' placeholder that the description
+  # requires.
+  - type: "file"
+    name: "--output"
+    description: "Location of the output folders, 1 folder per barcode. The value\
+      \ used\nfor this argument must contain a '*', which will be replaced with the\n\
+      barcode to form the final output location for that barcode.\n"
+    info: null
+    default:
+    - "./*"
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "output"
+    multiple: true
+    multiple_sep: ";"
+  - type: "file"
+    name: "--joblog"
+    description: "Where to store the log file listing all the jobs."
+    info: null
+    default:
+    - "execution_log.txt"
+    must_exist: true
+    create_parent: true
+    required: false
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+# Files shipped with the component: the bash script, a file named STAR (the
+# docker engine setup below copies it into /usr/local/bin), and the Nextflow
+# labels config that package_config.config_mods appends to every component.
+resources:
+- type: "bash_script"
+  path: "script.sh"
+  is_executable: true
+- type: "file"
+  path: "STAR"
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+# Component description, test script, status, runtime requirements, license and links.
+description: "Map wells in batch, using STAR\nSpliced Transcripts Alignment to a Reference\
+  \ (C) Alexander Dobin\nhttps://github.com/alexdobin/STAR\n"
+test_resources:
+- type: "bash_script"
+  path: "test.sh"
+  is_executable: true
+info: null
+status: "enabled"
+# The `ps` entry is the value assigned by package_config.config_mods
+# (`.requirements.commands := ['ps']`).
+requirements:
+  commands:
+  - "ps"
+license: "MIT"
+links:
+  repository: "https://github.com/viash-hub/htrnaseq"
+# Runners available for this component: a standalone executable and a Nextflow module.
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    # Label name -> Nextflow directive assignment. The mem<N>gb/tb labels are
+    # decimal byte counts (1 GB = 10^9 B); the mem<N>gib/tib labels are binary
+    # byte counts (1 GiB = 2^30 B). The cpu<N> labels set the cpus directive.
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    # Extra config line: pulls in the nextflow_labels.config file listed under resources.
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+# Execution engines: a Docker image built on debian:stable-slim, and the native host.
+engines:
+- type: "docker"
+  id: "docker"
+  image: "debian:stable-slim"
+  target_registry: "images.viash-hub.com"
+  target_tag: "fix_20"
+  namespace_separator: "/"
+  setup:
+  # Step 1: packages installed with apt.
+  - type: "apt"
+    packages:
+    - "procps"
+    - "wget"
+    - "automake"
+    - "make"
+    - "gcc"
+    - "g++"
+    - "zlib1g-dev"
+    - "parallel"
+    - "file"
+    interactive: false
+  # Step 2: copy the bundled STAR resource to /usr/local/bin/$STAR_BINARY.
+  # NOTE(review): STAR_SOURCE, STAR_TARGET and STAR_INSTALL_DIR are defined here
+  # but no setup step in this config references them; confirm whether they are
+  # still needed.
+  - type: "docker"
+    copy:
+    - "STAR /usr/local/bin/$STAR_BINARY"
+    build_args:
+    - "STAR_V=2.7.6a"
+    env:
+    - "STAR_SOURCE=\"https://github.com/alexdobin/STAR/archive/refs/tags/$STAR_V.tar.gz\""
+    - "STAR_TARGET=\"/app/star-$STAR_V.tar.gz\""
+    - "STAR_INSTALL_DIR=\"/app/STAR-$STAR_V\""
+    - "STAR_BINARY=STAR"
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+# Provenance of this build: source config, runner/engine, output locations,
+# viash version and git state.
+# git_remote must be the plain repository URL. The access token that was embedded
+# in it (`x-access-token:ghs_...@`) has been removed; treat that token as leaked
+# and revoke it.
+build_info:
+  config: "src/parallel_map/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/parallel_map"
+  executable: "target/executable/parallel_map/parallel_map"
+  viash_version: "0.9.0"
+  git_commit: "7ae6be67bd6159c27c4b50f0b2c1a56ec92b60c4"
+  git_remote: "https://github.com/viash-hub/htrnaseq"
+# Settings of the enclosing `htrnaseq` package.
+package_config:
+  name: "htrnaseq"
+  version: "fix_20"
+  description: "High-throughput pipeline [WIP]\n"
+  info:
+    test_resources:
+    - path: "gs://viash-hub-test-data/htrnaseq/v1/"
+      dest: "resources_test"
+  viash_version: "0.9.0"
+  source: "src"
+  target: "target"
+  # Config modifications listed for the package:
+  #   1. set requirements.commands to ['ps'], make the nextflow runner include
+  #      nextflow_labels.config, and add /src/config/labels.config as the
+  #      resource nextflow_labels.config;
+  #   2. add a native engine;
+  #   3. set the docker engine's target registry to images.viash-hub.com;
+  #   4. set the docker engine's target tag to fix_20.
+  config_mods:
+  - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\
+    \ := 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\
+    \ dest: 'nextflow_labels.config'}\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'fix_20'"
+  keywords:
+  - "bioinformatics"
+  - "sequence"
+  - "high-throughput"
+  - "mapping"
+  - "counting"
+  - "pipeline"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/viash-hub/htrnaseq"
+    issue_tracker: "https://github.com/viash-hub/htrnaseq/issues"
--- a/target/executable/parallel_map/STAR
+++ b/target/executable/parallel_map/STAR
--- a/target/executable/parallel_map/nextflow_labels.config
+++ b/target/executable/parallel_map/nextflow_labels.config
@@ -0,0 +1,108 @@
+// Executor settings scoped to the 'k8s' executor: job submission rate limit and
+// polling interval.
+executor {
+  $k8s {
+    submitRateLimit = '10sec'
+    pollInterval = '1 sec'
+  }
+}
+
+// Defaults applied to every process, followed by per-label CPU and memory overrides.
+process {
+  container = 'nextflow/bash:latest'
+  
+  // default resources
+  // The default memory request grows with each attempt; unlike the labelled
+  // requests below it is not passed through get_memory().
+  memory = { 8.Gb * task.attempt }
+  cpus = 8
+  maxForks = 36
+
+  // Retry for exit codes that have something to do with memory issues
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+  maxRetries = 3
+  // Memory ceiling read by get_memory() (defined at the end of this file).
+  maxMemory = 192.GB
+
+  // Resource labels
+  withLabel: verylowcpu { cpus = 2 }
+  withLabel: lowcpu { cpus = 8 }
+  withLabel: midcpu { cpus = 16 }
+  withLabel: highcpu { cpus = 32 }
+  
+  // Memory labels: the request scales with task.attempt and is passed through
+  // get_memory() before being used.
+  withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } }
+  withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
+  withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } }
+  withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } }
+
+}
+
+// Named configuration profiles, plus the tempDir value that 'mount_temp' uses.
+profiles {
+  // detect tempdir
+  // Takes the first of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR that is set and
+  // non-empty, falling back to '/tmp', and converts it to an absolute path.
+  tempDir = java.nio.file.Paths.get(
+    System.getenv('NXF_TEMP') ?:
+      System.getenv('VIASH_TEMP') ?: 
+      System.getenv('TEMPDIR') ?: 
+      System.getenv('TMPDIR') ?: 
+      '/tmp'
+  ).toAbsolutePath()
+
+  // Set the `temp` option of the docker, podman and charliecloud scopes to tempDir.
+  mount_temp {
+    docker.temp            = tempDir
+    podman.temp            = tempDir
+    charliecloud.temp      = tempDir
+  }
+
+  // Set publishDir to [enabled: false] for every process name matching '.*'.
+  no_publish {
+    process {
+      withName: '.*' {
+        publishDir = [
+          enabled: false
+        ]
+      }
+    }
+  }
+
+  // Enable Docker (with fixOwnership) and explicitly disable the other container engines.
+  docker {
+    docker.fixOwnership    = true
+    docker.enabled         = true
+    // docker.userEmulation   = true
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+
+  // Lower limits than the top-level process defaults: 25 GB memory ceiling,
+  // fewer CPUs per label, smaller memory requests per label.
+  local {
+    // This config is for local processing.
+    process {
+        // Run at most one task at a time for processes whose name matches
+        // '.*parallel_map_process'.
+        withName: ".*parallel_map_process" {
+          maxForks = 1
+        }
+        maxMemory = 25.GB
+        withLabel: verylowcpu { cpus = 2 }
+        withLabel: lowcpu { cpus = 4 }
+        withLabel: midcpu { cpus = 6 }
+        withLabel: highcpu { cpus = 8 }
+  
+        withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
+        withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }
+        withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } }
+    }
+  }
+}
+
+// Clamp a requested amount of memory to the configured `process.maxMemory`.
+//
+// to_compare: the requested memory; the label selectors in this file pass values
+//             such as `8.GB * task.attempt`.
+// Returns:
+//   - to_compare unchanged when no maxMemory is configured, or when the request
+//     does not exceed it;
+//   - process.maxMemory when the request exceeds it, or when task.attempt equals
+//     process.maxRetries.
+// If evaluating the limits throws, an error message is printed and the JVM is
+// terminated with exit status 1.
+def get_memory(to_compare) {
+    // No ceiling configured: hand the request back untouched.
+    if (!process.containsKey("maxMemory") || !process.maxMemory) {
+      return to_compare
+    }
+
+    try {
+      // NOTE(review): `task` is not a parameter of this function; confirm that it
+      // resolves as intended when this is called from a directive closure.
+      if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
+        // Attempt number equals maxRetries: ask for the full ceiling.
+        return process.maxMemory
+      }
+      else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
+        // compareTo() only guarantees the sign of its result, hence `> 0`.
+        // Fix: this branch used to return the undefined name `max_memory`.
+        return process.maxMemory as nextflow.util.MemoryUnit
+      }
+      else {
+        return to_compare
+      }
+    } catch (all) {
+          println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
+          System.exit(1)
+    }
+}
--- a/target/executable/parallel_map/parallel_map
+++ b/target/executable/parallel_map/parallel_map
--- a/target/executable/report/create_report/.config.vsh.yaml
+++ b/target/executable/report/create_report/.config.vsh.yaml
@@ -0,0 +1,235 @@
+name: "create_report"
+namespace: "report"
+version: "fix_20"
+# People credited for this component, each with roles, contact links and
+# organisational affiliation.
+authors:
+- name: "Dries Schaumont"
+  roles:
+  - "maintainer"
+  info:
+    links:
+      email: "dries@data-intuitive.com"
+      github: "DriesSchaumont"
+      orcid: "0000-0002-4389-0440"
+      linkedin: "dries-schaumont"
+    organizations:
+    - name: "Data Intuitive"
+      href: "https://www.data-intuitive.com"
+      role: "Data Scientist"
+- name: "Marijke Van Moerbeke"
+  roles:
+  - "author"
+  - "maintainer"
+  info:
+    links:
+      github: "mvanmoerbeke"
+      orcid: "0000-0002-3097-5621"
+      linkedin: "marijke-van-moerbeke-84303a34"
+    organizations:
+    - name: "OpenAnalytics"
+      href: "https://www.openanalytics.eu"
+      role: "Statistical Consultant"
+# Command-line interface of the component: one or more input eset files
+# (separated by ';') and a single output report file.
+argument_groups:
+- name: "Arguments"
+  arguments:
+  - type: "file"
+    name: "--eset"
+    info: null
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "input"
+    multiple: true
+    multiple_sep: ";"
+  - type: "file"
+    name: "--output_report"
+    info: null
+    example:
+    - "report.html"
+    must_exist: true
+    create_parent: true
+    required: true
+    direction: "output"
+    multiple: false
+    multiple_sep: ";"
+# Files shipped with the component: the R script, its R Markdown template and
+# helper script, an image, and the Nextflow labels config that
+# package_config.config_mods appends to every component.
+# NOTE(review): template.Rmd is declared with type "r_script" and
+# is_executable: true; confirm this is intended rather than type "file".
+resources:
+- type: "r_script"
+  path: "script.R"
+  is_executable: true
+- type: "r_script"
+  path: "template.Rmd"
+  is_executable: true
+- type: "r_script"
+  path: "plateLayouts.R"
+  is_executable: true
+- type: "file"
+  path: "OutputSTARsolo.png"
+- type: "file"
+  path: "nextflow_labels.config"
+  dest: "nextflow_labels.config"
+# Component description, test script and data, status, runtime requirements,
+# license and links.
+description: "Create a basic QC report in HTML format based on a number of esets.\n"
+test_resources:
+- type: "r_script"
+  path: "test.R"
+  is_executable: true
+- type: "file"
+  path: "test_data"
+info: null
+status: "enabled"
+# The `ps` entry is the value assigned by package_config.config_mods
+# (`.requirements.commands := ['ps']`).
+requirements:
+  commands:
+  - "ps"
+license: "MIT"
+links:
+  repository: "https://github.com/viash-hub/htrnaseq"
+# Runners available for this component: a standalone executable and a Nextflow module.
+runners:
+- type: "executable"
+  id: "executable"
+  docker_setup_strategy: "ifneedbepullelsecachedbuild"
+- type: "nextflow"
+  id: "nextflow"
+  directives:
+    tag: "$id"
+  auto:
+    simplifyInput: true
+    simplifyOutput: false
+    transcript: false
+    publish: false
+  config:
+    # Label name -> Nextflow directive assignment. The mem<N>gb/tb labels are
+    # decimal byte counts (1 GB = 10^9 B); the mem<N>gib/tib labels are binary
+    # byte counts (1 GiB = 2^30 B). The cpu<N> labels set the cpus directive.
+    labels:
+      mem1gb: "memory = 1000000000.B"
+      mem2gb: "memory = 2000000000.B"
+      mem5gb: "memory = 5000000000.B"
+      mem10gb: "memory = 10000000000.B"
+      mem20gb: "memory = 20000000000.B"
+      mem50gb: "memory = 50000000000.B"
+      mem100gb: "memory = 100000000000.B"
+      mem200gb: "memory = 200000000000.B"
+      mem500gb: "memory = 500000000000.B"
+      mem1tb: "memory = 1000000000000.B"
+      mem2tb: "memory = 2000000000000.B"
+      mem5tb: "memory = 5000000000000.B"
+      mem10tb: "memory = 10000000000000.B"
+      mem20tb: "memory = 20000000000000.B"
+      mem50tb: "memory = 50000000000000.B"
+      mem100tb: "memory = 100000000000000.B"
+      mem200tb: "memory = 200000000000000.B"
+      mem500tb: "memory = 500000000000000.B"
+      mem1gib: "memory = 1073741824.B"
+      mem2gib: "memory = 2147483648.B"
+      mem4gib: "memory = 4294967296.B"
+      mem8gib: "memory = 8589934592.B"
+      mem16gib: "memory = 17179869184.B"
+      mem32gib: "memory = 34359738368.B"
+      mem64gib: "memory = 68719476736.B"
+      mem128gib: "memory = 137438953472.B"
+      mem256gib: "memory = 274877906944.B"
+      mem512gib: "memory = 549755813888.B"
+      mem1tib: "memory = 1099511627776.B"
+      mem2tib: "memory = 2199023255552.B"
+      mem4tib: "memory = 4398046511104.B"
+      mem8tib: "memory = 8796093022208.B"
+      mem16tib: "memory = 17592186044416.B"
+      mem32tib: "memory = 35184372088832.B"
+      mem64tib: "memory = 70368744177664.B"
+      mem128tib: "memory = 140737488355328.B"
+      mem256tib: "memory = 281474976710656.B"
+      mem512tib: "memory = 562949953421312.B"
+      cpu1: "cpus = 1"
+      cpu2: "cpus = 2"
+      cpu5: "cpus = 5"
+      cpu10: "cpus = 10"
+      cpu20: "cpus = 20"
+      cpu50: "cpus = 50"
+      cpu100: "cpus = 100"
+      cpu200: "cpus = 200"
+      cpu500: "cpus = 500"
+      cpu1000: "cpus = 1000"
+    # Extra config line: pulls in the nextflow_labels.config file listed under resources.
+    script:
+    - "includeConfig(\"nextflow_labels.config\")"
+  debug: false
+  container: "docker"
+# Execution engines: a Docker image built on rocker/r2u:24.04, and the native host.
+engines:
+- type: "docker"
+  id: "docker"
+  image: "rocker/r2u:24.04"
+  target_registry: "images.viash-hub.com"
+  target_tag: "fix_20"
+  namespace_separator: "/"
+  setup:
+  # Step 1: packages installed with apt.
+  - type: "apt"
+    packages:
+    - "procps"
+    - "pandoc"
+    interactive: false
+  # Step 2: R packages from CRAN and Bioconductor, plus a script line that
+  # installs oaStyle from the OpenAnalytics public repository.
+  - type: "r"
+    cran:
+    - "ggplot2"
+    - "knitr"
+    - "gridExtra"
+    - "RColorBrewer"
+    - "processx"
+    - "whisker"
+    - "rmarkdown"
+    - "bookdown"
+    - "data.table"
+    - "platetools"
+    - "htmltools"
+    - "DT"
+    - "logger"
+    - "bit64"
+    bioc:
+    - "Biobase"
+    - "ComplexHeatmap"
+    script:
+    - "install.packages(\"oaStyle\", repos = c(rdepot = \"https://repos.openanalytics.eu/repo/public\"\
+      , getOption(\"repos\")))"
+    bioc_force_install: false
+  # Additional R packages listed under test_setup rather than setup.
+  test_setup:
+  - type: "r"
+    packages:
+    - "testthat"
+    - "R.utils"
+    bioc_force_install: false
+  entrypoint: []
+  cmd: null
+- type: "native"
+  id: "native"
+# Provenance of this build: source config, runner/engine, output locations,
+# viash version and git state.
+# git_remote must be the plain repository URL. The access token that was embedded
+# in it (`x-access-token:ghs_...@`) has been removed; treat that token as leaked
+# and revoke it.
+build_info:
+  config: "src/report/config.vsh.yaml"
+  runner: "executable"
+  engine: "docker|native"
+  output: "target/executable/report/create_report"
+  executable: "target/executable/report/create_report/create_report"
+  viash_version: "0.9.0"
+  git_commit: "7ae6be67bd6159c27c4b50f0b2c1a56ec92b60c4"
+  git_remote: "https://github.com/viash-hub/htrnaseq"
+# Settings of the enclosing `htrnaseq` package.
+package_config:
+  name: "htrnaseq"
+  version: "fix_20"
+  description: "High-throughput pipeline [WIP]\n"
+  info:
+    test_resources:
+    - path: "gs://viash-hub-test-data/htrnaseq/v1/"
+      dest: "resources_test"
+  viash_version: "0.9.0"
+  source: "src"
+  target: "target"
+  # Config modifications listed for the package:
+  #   1. set requirements.commands to ['ps'], make the nextflow runner include
+  #      nextflow_labels.config, and add /src/config/labels.config as the
+  #      resource nextflow_labels.config;
+  #   2. add a native engine;
+  #   3. set the docker engine's target registry to images.viash-hub.com;
+  #   4. set the docker engine's target tag to fix_20.
+  config_mods:
+  - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\
+    \ := 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\
+    \ dest: 'nextflow_labels.config'}\n"
+  - ".engines += { type: \"native\" }"
+  - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'"
+  - ".engines[.type == 'docker'].target_tag := 'fix_20'"
+  keywords:
+  - "bioinformatics"
+  - "sequence"
+  - "high-throughput"
+  - "mapping"
+  - "counting"
+  - "pipeline"
+  license: "MIT"
+  organization: "vsh"
+  links:
+    repository: "https://github.com/viash-hub/htrnaseq"
+    issue_tracker: "https://github.com/viash-hub/htrnaseq/issues"
--- a/Show More
+++ b/Show More